From 5767035276d55aeca7fdb50e7ceaf09b72d0d62e Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Tue, 13 Aug 2024 15:02:26 -0400
Subject: [PATCH 001/161] Initial dagster integration

---
 pyproject.toml                         |   3 +
 src/mozilla_sec_eia/assets.py          |  50 ++++++
 src/mozilla_sec_eia/ex_21/extractor.py |   3 -
 src/mozilla_sec_eia/extract.py         | 235 ++++++++++++++++---------
 src/mozilla_sec_eia/utils/cloud.py     |  90 +++++-----
 tests/conftest.py                      |  22 ---
 tests/unit/extract_test.py             |  64 +++++--
 tests/unit/utils_test.py               |  20 +--
 8 files changed, 310 insertions(+), 177 deletions(-)
 create mode 100644 src/mozilla_sec_eia/assets.py

diff --git a/pyproject.toml b/pyproject.toml
index ca30b8a..985f73e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,6 +20,9 @@ license = {file = "LICENSE.txt"}
 dependencies = [
     "accelerate>=0.21.0,<1.0", # Hugging Face dependency for PyTorch models
     "cloud-sql-python-connector[pg8000]",
+    "dagster>=1.7.15", # 1.7.13 & 1.7.14 were both breaking things
+    "dagster-mlflow",
+    "dagster-webserver",
     "datasets>=2.1,<3", # Access Hugging Face datasets
     "seqeval>=1.2,<2", # Sequence labeling evaluation
     "google-cloud-secret-manager>=2,<3",
diff --git a/src/mozilla_sec_eia/assets.py b/src/mozilla_sec_eia/assets.py
new file mode 100644
index 0000000..e5656ac
--- /dev/null
+++ b/src/mozilla_sec_eia/assets.py
@@ -0,0 +1,50 @@
+"""Define asset jobs and configuration."""
+
+import logging
+
+import coloredlogs
+from dagster import Definitions, EnvVar, define_asset_job
+
+from mozilla_sec_eia.extract import ExtractConfig, basic_10k_extract, basic_10k_validate
+from mozilla_sec_eia.utils.cloud import GCSArchive, MlflowInterface
+
+logger = logging.getLogger("catalystcoop")
+log_format = "%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s"
+coloredlogs.install(fmt=log_format, logger=logger)
+
+extract_job = define_asset_job(
+    name="extract_job",
+    selection=[basic_10k_extract],
+)
+validate_job = define_asset_job(
+    name="validate_job",
+    selection=[basic_10k_validate],
+)
+
+cloud_interface = GCSArchive(
+    filings_bucket_name=EnvVar("GCS_FILINGS_BUCKET_NAME"),
+    labels_bucket_name=EnvVar("GCS_LABELS_BUCKET_NAME"),
+    metadata_db_instance_connection=EnvVar("GCS_METADATA_DB_INSTANCE_CONNECTION"),
+    user=EnvVar("GCS_IAM_USER"),
+    metadata_db_name=EnvVar("GCS_METADATA_DB_NAME"),
+    project=EnvVar("GCS_PROJECT"),
+)
+
+defs = Definitions(
+    assets=[basic_10k_validate, basic_10k_extract],
+    jobs=[extract_job, validate_job],
+    resources={
+        "cloud_interface": cloud_interface,
+        "basic_10k_extract_config": ExtractConfig(),
+        "basic_10k_extract_mlflow": MlflowInterface(
+            experiment_name="basic_10k_extraction",
+            tracking_uri=EnvVar("MLFLOW_TRACKING_URI"),
+            cloud_interface=cloud_interface,
+        ),
+        "basic_10k_extract_validate_mlflow": MlflowInterface(
+            experiment_name="basic_10k_extraction_validation",
+            tracking_uri=EnvVar("MLFLOW_TRACKING_URI"),
+            cloud_interface=cloud_interface,
+        ),
+    },
+)
diff --git a/src/mozilla_sec_eia/ex_21/extractor.py b/src/mozilla_sec_eia/ex_21/extractor.py
index faf2dcb..8a3b617 100644
--- a/src/mozilla_sec_eia/ex_21/extractor.py
+++ b/src/mozilla_sec_eia/ex_21/extractor.py
@@ -27,7 +27,6 @@
 from transformers.data.data_collator import default_data_collator
 
 from mozilla_sec_eia.ex_21.create_labeled_dataset import format_as_ner_annotations
-from mozilla_sec_eia.utils.cloud import initialize_mlflow
 
 LABELS = ["O", "B-Subsidiary", "I-Subsidiary", "B-Loc", "I-Loc", "B-Own_Per"]
 
@@ -140,7 +139,6 @@ def log_model(finetuned_model: Trainer):
 
 def load_model():
     """Load fine-tuned model from mlflow artifacts."""
-    initialize_mlflow()
     return mlflow.transformers.load_model(
         "models:/layoutlm_extractor/1", return_type="components"
     )
@@ -160,7 +158,6 @@ def train_model(
         test_size: Proportion of labeled dataset to use for test set.
     """
     # Prepare mlflow for tracking/logging model
-    initialize_mlflow()
     mlflow.set_experiment("/finetune-layoutlmv3")
 
     # Prepare model
diff --git a/src/mozilla_sec_eia/extract.py b/src/mozilla_sec_eia/extract.py
index d21c3f5..f4bf1bc 100644
--- a/src/mozilla_sec_eia/extract.py
+++ b/src/mozilla_sec_eia/extract.py
@@ -2,15 +2,18 @@
 
 import io
 import logging
+import tempfile
 from importlib import resources
+from pathlib import Path
 
 import mlflow
 import pandas as pd
 import pandera as pa
+from dagster import ConfigurableResource, asset
 from mlflow.entities import Run
 
 from mozilla_sec_eia import basic_10k
-from mozilla_sec_eia.utils.cloud import GCSArchive, initialize_mlflow
+from mozilla_sec_eia.utils.cloud import GCSArchive
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
@@ -40,6 +43,22 @@ def _log_artifact_as_csv(
     return mlflow.log_text(artifact.to_csv(index=index), artifact_name)
 
 
+def _load_artifact_as_parquet(run: Run, artifact_name: str) -> pd.DataFrame:
+    """Download a parquet file and parse to DataFrame from mlflow tracking server."""
+    df = pd.read_parquet(run.info.artifact_uri + artifact_name)
+    return df
+
+
+def _log_artifact_as_parquet(
+    artifact: pd.DataFrame, artifact_name: str, index: bool = True
+):
+    """Upload a DataFrame as a parquet file to mlflow tracking server."""
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        parquet_path = Path(tmp_dir) / artifact_name
+        artifact.to_parquet(parquet_path, index=index)
+        return mlflow.log_artifact(parquet_path, artifact_name)
+
+
 def _get_most_recent_run(experiment_name: str):
     """Search mlflow for most recent extraction run with specified experiment name."""
     run_metadata = mlflow.search_runs(experiment_names=[experiment_name])
@@ -76,7 +95,7 @@ def _get_filings_to_extract(
                 most_recent_run, "/extraction_metadata.csv"
             ).set_index("filename")
         )
-        extracted = _load_artifact_as_csv(most_recent_run, "/extracted.csv")
+        extracted = _load_artifact_as_parquet(most_recent_run, "/extracted.parquet")
         run_id = most_recent_run.info.run_id
 
     filings_to_extract = metadata[~metadata["filename"].isin(extraction_metadata.index)]
@@ -137,85 +156,17 @@ def compute_validation_metrics(
     }
 
 
-def validate_extraction(dataset: str):
-    """Run extraction on validation set and compare results to labeled data."""
-    validation_set = pd.read_csv(
-        resources.files("mozilla_sec_eia.package_data") / f"{dataset}_labels.csv"
-    )
-
-    # Get metadata for labelled filings
-    archive = GCSArchive()
-    to_extract = archive.get_metadata(filenames=list(validation_set["filename"]))
-
-    # Extract data from filings
-    extracted = extract_filings(
-        dataset=dataset, metadata=to_extract, experiment_suffix="validation"
-    )
-    # Set index for validation set based on returned extracted DF
-    validation_set = validation_set.set_index(extracted.index.names)
-
-    # Get extraction run from mlflow and start again to log validation metrics
-    experiment_name = _get_experiment_name(dataset, experiment_suffix="validation")
-    run = _get_most_recent_run(experiment_name)
-    with mlflow.start_run(run_id=run.info.run_id):
-        # Compute metrics and log
-        if dataset == "basic_10k":
-            mlflow.log_metrics(
-                compute_validation_metrics(extracted, validation_set, "value")
-            )
-        # Log validation set used to compute metrics
-        _log_artifact_as_csv(validation_set, "labels.csv")
-
-
 def extract_filings(
     dataset: str,
-    continue_run: bool = False,
-    num_filings: int = -1,
-    metadata: pd.DataFrame | None = None,
-    experiment_suffix: str | None = None,
+    filings_to_extract: pd.DataFrame,
+    extraction_metadata: pd.DataFrame,
+    extracted: pd.DataFrame,
+    num_filings: int,
+    experiment_name: str,
+    cloud_interface: GCSArchive,
+    run_id: str | None = None,
 ) -> pd.DataFrame:
-    """Extra data from SEC 10k and exhibit 21 filings.
-
-    This function takes several parameters to decide which filings to extract data
-    from. If `continue_run` is set, it will search the mlflow tracking server for
-    the most recent extraction run for the specified dataset, and download corresponding
-    metadata and extraction results. It will then filter out any filings that were
-    already extracted in the run. This is useful for testing to be able perform extraction
-    on subsets of the data and continue where it left off. If `metadata` is passed
-    in, this is expected to be a selection of filing metadata that specifies exactly
-    which filings to extract. This is used for validation to only extract filings in
-    the validation set.
-
-    Args:
-        dataset: Data to extract, should be 'basic_10k' or 'ex21'.
-        continue_run: Whether to continue a previous extraction run.
-        num_filings: Number of filings to extract in run.
-        metadata: Specific selection of filing metadata to extract.
-        experiment_suffix: Add to mlflow run to differentiate run from basic extraction.
-    """
-    if dataset not in ["ex21", "basic_10k"]:
-        raise RuntimeError(
-            f"{dataset} is not a valid dataset. Must be 'ex21' or 'basic_10k'."
-        )
-
-    initialize_mlflow()
-
-    # Get filing metadata if not passed in explicitly
-    archive = GCSArchive()
-    if metadata is None:
-        metadata = archive.get_metadata()
-
-    experiment_name = _get_experiment_name(dataset, experiment_suffix=experiment_suffix)
-
-    # Get filings to extract as well as any existing metadata for run
-    filings_to_extract, extraction_metadata, extracted, run_id = (
-        _get_filings_to_extract(
-            experiment_name,
-            metadata,
-            continue_run=continue_run,
-            num_filings=num_filings,
-        )
-    )
+    """Extract filings in `filings_to_extract`."""
     mlflow.set_experiment(experiment_name)
     with mlflow.start_run(run_id=run_id):
         # Extract data for desired filings
@@ -224,7 +175,7 @@ def extract_filings(
                 filings_to_extract,
                 extraction_metadata,
                 extracted,
-                archive,
+                cloud_interface,
             )
         else:
             logger.warning("Exhibit 21 extraction is not yet implemented.")
@@ -234,14 +185,136 @@ def extract_filings(
         mlflow.log_metrics(
             {
                 "num_failed": (~extraction_metadata["success"]).sum(),
-                "ratio_extracted": len(extraction_metadata) / len(metadata),
+                "ratio_extracted": len(extraction_metadata) / num_filings,
             }
         )
 
         # Log the extraction results + metadata for future reference/analysis
         _log_artifact_as_csv(extraction_metadata, "extraction_metadata.csv")
-        _log_artifact_as_csv(extracted, "extracted.csv")
+        _log_artifact_as_parquet(extracted, "extracted.parquet")
     logger.info(
         f"Finished extracting {len(extraction_metadata)} filings from {dataset}."
     )
     return extracted
+
+
+class ExtractConfig(ConfigurableResource):
+    """Basic configuration for an extraction run."""
+
+    num_filings: int = -1
+
+
+def extract_asset_factory(dataset: str) -> asset:
+    """Produce asset to extract `dataset`."""
+
+    @asset(
+        name=f"{dataset}_extract",
+        required_resource_keys={
+            f"{dataset}_extract_config",
+            f"{dataset}_extract_mlflow",
+            "cloud_interface",
+        },
+    )
+    def extract(context) -> pd.DataFrame:
+        config = context.resources.original_resource_dict[f"{dataset}_extract_config"]
+        cloud_interface: GCSArchive = context.resources.cloud_interface
+        mlflow_interface = context.resources.original_resource_dict[
+            f"{dataset}_extract_mlflow"
+        ]
+        experiment_name = mlflow_interface.experiment_name
+        metadata = cloud_interface.get_metadata()
+
+        # Get filings to extract as well as any existing metadata for run
+        filings_to_extract, extraction_metadata, extracted, run_id = (
+            _get_filings_to_extract(
+                experiment_name,
+                metadata,
+                continue_run=mlflow_interface.continue_run,
+                num_filings=config.num_filings,
+            )
+        )
+
+        return extract_filings(
+            dataset=dataset,
+            filings_to_extract=filings_to_extract,
+            extraction_metadata=extraction_metadata,
+            extracted=extracted,
+            num_filings=len(metadata),
+            experiment_name=experiment_name,
+            cloud_interface=cloud_interface,
+            run_id=run_id,
+        )
+
+    return extract
+
+
+def validate_extraction(
+    dataset: str, experiment_name: str, cloud_interface: GCSArchive
+):
+    """Run extraction on validation set and compare results to labeled data."""
+    validation_set = pd.read_csv(
+        resources.files("mozilla_sec_eia.package_data") / f"{dataset}_labels.csv"
+    )
+
+    # Get metadata for labelled filings
+    to_extract = cloud_interface.get_metadata(
+        filenames=list(validation_set["filename"])
+    )
+
+    # Get filings to extract as well as any existing metadata for run
+    filings_to_extract, extraction_metadata, extracted, run_id = (
+        _get_filings_to_extract(
+            experiment_name,
+            to_extract,
+        )
+    )
+
+    # Extract data from filings
+    extracted = extract_filings(
+        dataset=dataset,
+        filings_to_extract=filings_to_extract,
+        extraction_metadata=extraction_metadata,
+        extracted=extracted,
+        num_filings=len(to_extract),
+        experiment_name=experiment_name,
+        cloud_interface=cloud_interface,
+        run_id=run_id,
+    )
+
+    # Set index for validation set based on returned extracted DF
+    validation_set = validation_set.set_index(extracted.index.names)
+
+    # Get extraction run from mlflow and start again to log validation metrics
+    run = _get_most_recent_run(experiment_name)
+    with mlflow.start_run(run_id=run.info.run_id):
+        # Compute metrics and log
+        if dataset == "basic_10k":
+            mlflow.log_metrics(
+                compute_validation_metrics(extracted, validation_set, "value")
+            )
+        # Log validation set used to compute metrics
+        _log_artifact_as_csv(validation_set, "labels.csv")
+
+
+def validate_extraction_asset_factory(dataset: str):
+    """Create asset that extracts validation filings and computes validation metrics."""
+
+    @asset(
+        name=f"{dataset}_extract_validate",
+        required_resource_keys={
+            f"{dataset}_extract_validate_mlflow",
+            "cloud_interface",
+        },
+    )
+    def validate(context):
+        cloud_interface: GCSArchive = context.resources.cloud_interface
+        experiment_name = context.resources.original_resource_dict[
+            f"{dataset}_extract_validate_mlflow"
+        ].experiment_name
+        return validate_extraction(dataset, experiment_name, cloud_interface)
+
+    return validate
+
+
+basic_10k_extract = extract_asset_factory("basic_10k")
+basic_10k_validate = validate_extraction_asset_factory("basic_10k")
diff --git a/src/mozilla_sec_eia/utils/cloud.py b/src/mozilla_sec_eia/utils/cloud.py
index 1d1866d..6753baa 100644
--- a/src/mozilla_sec_eia/utils/cloud.py
+++ b/src/mozilla_sec_eia/utils/cloud.py
@@ -11,13 +11,14 @@
 from typing import BinaryIO, TextIO
 
 import fitz
+import mlflow
 import pandas as pd
 import pg8000
+from dagster import ConfigurableResource
 from google.cloud import secretmanager, storage
 from google.cloud.sql.connector import Connector
 from PIL import Image
-from pydantic import BaseModel, Field, PrivateAttr
-from pydantic_settings import BaseSettings, SettingsConfigDict
+from pydantic import BaseModel, PrivateAttr
 from sqlalchemy import Engine, create_engine, select
 from sqlalchemy.orm import Session
 from xhtml2pdf import pisa
@@ -142,8 +143,8 @@ def from_file(
         )
 
 
-class GoogleCloudSettings(BaseSettings):
-    """Load environment variables to manage access to cloud resources.
+class GCSArchive(ConfigurableResource):
+    """Provides an interface for archived filings on GCS.
 
     This class looks for several environment variables to configure
     access to cloud resources. These can be set directly, or be in a
@@ -161,35 +162,23 @@ class GoogleCloudSettings(BaseSettings):
     MLFLOW_TRACKING_URI: URI of mlflow tracking server.
     """
 
-    model_config = SettingsConfigDict(env_file=".env")
-
-    filings_bucket_name: str = Field(validation_alias="GCS_FILINGS_BUCKET_NAME")
-    labels_bucket_name: str = Field(validation_alias="GCS_LABELS_BUCKET_NAME")
-    metadata_db_instance_connection: str = Field(
-        validation_alias="GCS_METADATA_DB_INSTANCE_CONNECTION"
-    )
-    user: str = Field(validation_alias="GCS_IAM_USER")
-    metadata_db_name: str = Field(validation_alias="GCS_METADATA_DB_NAME")
-    project: str = Field(validation_alias="GCS_PROJECT")
-    tracking_uri: str = Field(validation_alias="MLFLOW_TRACKING_URI")
-
-
-class GCSArchive(BaseModel):
-    """Provides an interface for archived filings on GCS."""
-
-    settings: GoogleCloudSettings = Field(default_factory=lambda: GoogleCloudSettings())
+    filings_bucket_name: str
+    labels_bucket_name: str
+    metadata_db_instance_connection: str
+    user: str
+    metadata_db_name: str
+    project: str
 
     _filings_bucket = PrivateAttr()
     _labels_bucket = PrivateAttr()
     _engine = PrivateAttr()
     _metadata_df = PrivateAttr(default=None)
 
-    def __init__(self, **kwargs):
+    def setup_for_execution(self, context):
         """Initialize interface to filings archive on GCS."""
-        super().__init__(**kwargs)
         self._engine = self._get_engine()
-        self._filings_bucket = self._get_bucket(self.settings.filings_bucket_name)
-        self._labels_bucket = self._get_bucket(self.settings.labels_bucket_name)
+        self._filings_bucket = self._get_bucket(self.filings_bucket_name)
+        self._labels_bucket = self._get_bucket(self.labels_bucket_name)
 
         Base.metadata.create_all(self._engine)
 
@@ -208,10 +197,10 @@ def _get_engine(self) -> Engine:
 
         def getconn() -> pg8000.dbapi.Connection:
             conn: pg8000.dbapi.Connection = connector.connect(
-                self.settings.metadata_db_instance_connection,
+                self.metadata_db_instance_connection,
                 "pg8000",
-                user=self.settings.user,
-                db=self.settings.metadata_db_name,
+                user=self.user,
+                db=self.metadata_db_name,
                 enable_iam_auth=True,
             )
             return conn
@@ -414,17 +403,34 @@ def _access_secret_version(secret_id: str, project_id: str, version_id="latest")
     return response.payload.data.decode("UTF-8")
 
 
-def initialize_mlflow(settings: GoogleCloudSettings | None = None):
-    """Set appropriate environment variables to prepare connection to tracking server."""
-    if settings is None:
-        settings = GoogleCloudSettings()
-
-    os.environ["MLFLOW_TRACKING_USERNAME"] = "admin"
-    os.environ["MLFLOW_TRACKING_PASSWORD"] = _access_secret_version(
-        "mlflow_admin_password", settings.project
-    )
-    os.environ["MLFLOW_TRACKING_URI"] = settings.tracking_uri
-    os.environ["MLFLOW_GCS_DOWNLOAD_CHUNK_SIZE"] = "20971520"
-    os.environ["MLFLOW_GCS_UPLOAD_CHUNK_SIZE"] = "20971520"
-    os.environ["MLFLOW_HTTP_REQUEST_TIMEOUT"] = "900"
-    logger.info(f"Initialized tracking with mlflow server: {settings.tracking_uri}")
+class MlflowInterface(ConfigurableResource):
+    """Initialize interface to mlflow for desired experiment."""
+
+    experiment_name: str
+    continue_run: bool = False
+    tracking_uri: str
+    cloud_interface: GCSArchive
+    artifact_location: str | None = None
+
+    def setup_for_execution(self, context):
+        """Do runtime configuration of mlflow."""
+        os.environ["MLFLOW_TRACKING_USERNAME"] = "admin"
+        os.environ["MLFLOW_TRACKING_PASSWORD"] = _access_secret_version(
+            "mlflow_admin_password", self.cloud_interface.project
+        )
+        os.environ["MLFLOW_TRACKING_URI"] = self.tracking_uri
+        os.environ["MLFLOW_GCS_DOWNLOAD_CHUNK_SIZE"] = "20971520"
+        os.environ["MLFLOW_GCS_UPLOAD_CHUNK_SIZE"] = "20971520"
+        os.environ["MLFLOW_HTTP_REQUEST_TIMEOUT"] = "900"
+        logger.info(f"Initialized tracking with mlflow server: {self.tracking_uri}")
+
+        self.create_experiment()
+
+    def create_experiment(self):
+        """Create experiment if it doesn't already exist."""
+        logger.info(f"Creating experiment: {self.experiment_name}")
+        if not mlflow.get_experiment_by_name(self.experiment_name):
+            mlflow.create_experiment(
+                name=self.experiment_name,
+                artifact_location=self.artifact_location,
+            )
diff --git a/tests/conftest.py b/tests/conftest.py
index 62c0058..6db667b 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,11 +1,9 @@
 """PyTest configuration module. Defines useful fixtures, command line args."""
 
 import logging
-import unittest
 from pathlib import Path
 
 import pytest
-from mozilla_sec_eia.utils.cloud import GoogleCloudSettings, initialize_mlflow
 
 logger = logging.getLogger(__name__)
 
@@ -35,23 +33,3 @@ def test_dir() -> Path:
     Mostly this is meant as an example of a fixture.
     """
     return Path(__file__).parent
-
-
-@pytest.fixture
-def test_settings(test_dir):
-    """Return test GoogleCloudSettings object."""
-    return GoogleCloudSettings(_env_file=test_dir / "test.env")
-
-
-@pytest.fixture
-def test_mlflow_init_func(test_settings):
-    """Return a function that can replace ``initialize_mlflow`` with no external calls."""
-
-    def _test_init():
-        with unittest.mock.patch(
-            "mozilla_sec_eia.utils.cloud._access_secret_version",
-            new=lambda *args: "password",
-        ):
-            return initialize_mlflow(test_settings)
-
-    return _test_init
diff --git a/tests/unit/extract_test.py b/tests/unit/extract_test.py
index 63b59cb..4441c2b 100644
--- a/tests/unit/extract_test.py
+++ b/tests/unit/extract_test.py
@@ -1,15 +1,20 @@
 """Test extraction tools/methods."""
 
+import logging
 import unittest
 
-import mlflow
 import pandas as pd
 import pytest
+from dagster import build_asset_context
 from mozilla_sec_eia.extract import (
+    ExtractConfig,
     _get_most_recent_run,
+    basic_10k_extract,
     compute_validation_metrics,
-    extract_filings,
 )
+from mozilla_sec_eia.utils.cloud import GCSArchive, MlflowInterface
+
+logger = logging.getLogger(f"catalystcoop.{__name__}")
 
 
 @pytest.fixture
@@ -58,7 +63,6 @@ def second_run_results():
 
 
 def test_extract_basic_10k(
-    test_mlflow_init_func,
     filings_metadata,
     first_run_results,
     second_run_results,
@@ -66,30 +70,54 @@ def test_extract_basic_10k(
 ):
     """Test high level extraction workflow."""
 
-    class FakeArchive:
+    class FakeArchive(GCSArchive):
+        filings_bucket_name: str = ""
+        labels_bucket_name: str = ""
+        metadata_db_instance_connection: str = ""
+        user: str = ""
+        metadata_db_name: str = ""
+        project: str = ""
+
+        def setup_for_execution(self, context):
+            pass
+
         def get_metadata(self):
             return filings_metadata
 
-    with (
-        unittest.mock.patch("mozilla_sec_eia.extract.initialize_mlflow"),
-        unittest.mock.patch("mozilla_sec_eia.extract.GCSArchive", new=FakeArchive),
-    ):
-        # Initialize mlflow with test settings
-        test_mlflow_init_func()
-        mlflow.create_experiment(
-            "basic_10k_extraction", artifact_location=str(tmp_path)
-        )
+    # Initialize mlflow with test settings
+    experiment_name = "basic_10k_extract_unit_test"
 
+    with unittest.mock.patch(
+        "mozilla_sec_eia.utils.cloud._access_secret_version", new=lambda *args: ""
+    ):
         for i, results in enumerate([first_run_results, second_run_results]):
-            kwargs = {"num_filings": 3} if i == 0 else {"continue_run": True}
-            with unittest.mock.patch(
-                "mozilla_sec_eia.extract.basic_10k.extract", new=lambda *args: results
+            logger.info(f"Run {i} of basic 10k extraction.")
+            with (
+                build_asset_context(
+                    resources={
+                        "basic_10k_extract_config": ExtractConfig(
+                            num_filings=3 if i == 0 else -1
+                        ),
+                        "basic_10k_extract_mlflow": MlflowInterface(
+                            experiment_name=experiment_name,
+                            continue_run=i > 0,
+                            tracking_uri="sqlite:///:memory:",
+                            cloud_interface=FakeArchive(),
+                            artifact_location=str(tmp_path),
+                        ),
+                        "cloud_interface": FakeArchive(),
+                    }
+                ) as context,
+                unittest.mock.patch(
+                    "mozilla_sec_eia.extract.basic_10k.extract",
+                    new=lambda *args: results,
+                ),
             ):
                 metadata = results[0]
 
                 # Run extract method
-                extract_filings("basic_10k", **kwargs)
-                run = _get_most_recent_run("basic_10k_extraction")
+                basic_10k_extract(context)
+                run = _get_most_recent_run(experiment_name)
                 assert run.data.metrics["num_failed"] == (~metadata["success"]).sum()
                 assert run.data.metrics["ratio_extracted"] == len(metadata) / len(
                     filings_metadata
diff --git a/tests/unit/utils_test.py b/tests/unit/utils_test.py
index 1907de4..b935c8f 100644
--- a/tests/unit/utils_test.py
+++ b/tests/unit/utils_test.py
@@ -9,7 +9,6 @@
 from mozilla_sec_eia.utils.cloud import (
     Exhibit21,
     GCSArchive,
-    GoogleCloudSettings,
     Sec10K,
 )
 
@@ -21,17 +20,16 @@ def test_archive():
         unittest.mock.patch("mozilla_sec_eia.utils.cloud.GCSArchive._get_engine"),
         unittest.mock.patch("mozilla_sec_eia.utils.cloud.GCSArchive._get_bucket"),
     ):
-        return GCSArchive(
-            settings=GoogleCloudSettings(
-                GCS_FILINGS_BUCKET_NAME="filings_bucket_name",
-                GCS_LABELS_BUCKET_NAME="labels_bucket_name",
-                GCS_METADATA_DB_INSTANCE_CONNECTION="metadata_db_instance_connection",
-                GCS_IAM_USER="user",
-                GCS_METADATA_DB_NAME="metadata_db_name",
-                GCS_PROJECT="project_name",
-                MLFLOW_TRACKING_URI="http://tracking.server",
-            )
+        archive = GCSArchive(
+            filings_bucket_name="filings_bucket_name",
+            labels_bucket_name="labels_bucket_name",
+            metadata_db_instance_connection="metadata_db_instance_connection",
+            user="user",
+            metadata_db_name="metadata_db_name",
+            project="project_name",
         )
+        archive.setup_for_execution("fake_context")
+        return archive
 
 
 @dataclass

From 9d9fbfd409057bc3c9fe9c8d730d1b3e40d060d0 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 14 Aug 2024 10:02:34 -0400
Subject: [PATCH 002/161] Update validate integration test to dagster infra

---
 tests/integration/extract_test.py | 38 +++++++++++++++++++++++--------
 1 file changed, 28 insertions(+), 10 deletions(-)

diff --git a/tests/integration/extract_test.py b/tests/integration/extract_test.py
index 7414459..495488c 100644
--- a/tests/integration/extract_test.py
+++ b/tests/integration/extract_test.py
@@ -1,22 +1,40 @@
 """Validate basic 10k and exhibit 21 extraction."""
 
-import unittest
-
+import dotenv
+from dagster import EnvVar, build_asset_context
 from mozilla_sec_eia.extract import (
-    _get_experiment_name,
     _get_most_recent_run,
-    validate_extraction,
+    basic_10k_validate,
 )
+from mozilla_sec_eia.utils.cloud import GCSArchive, MlflowInterface
 
 
-def test_basic_10k_extraction(test_mlflow_init_func):
+def test_basic_10k_extraction():
     """Run full 10k extraction on validation set and verify desired metrics are met."""
-    with unittest.mock.patch("mozilla_sec_eia.extract.initialize_mlflow"):
-        test_mlflow_init_func()
-        validate_extraction("basic_10k")
-    run = _get_most_recent_run(
-        _get_experiment_name("basic_10k", experiment_suffix="validation")
+    dotenv.load_dotenv()
+    experiment_name = "basic_10k_validate_test"
+    cloud_interface = GCSArchive(
+        filings_bucket_name=EnvVar("GCS_FILINGS_BUCKET_NAME"),
+        labels_bucket_name=EnvVar("GCS_LABELS_BUCKET_NAME"),
+        metadata_db_instance_connection=EnvVar("GCS_METADATA_DB_INSTANCE_CONNECTION"),
+        user=EnvVar("GCS_IAM_USER"),
+        metadata_db_name=EnvVar("GCS_METADATA_DB_NAME"),
+        project=EnvVar("GCS_PROJECT"),
     )
 
+    with build_asset_context(
+        resources={
+            "basic_10k_extract_validate_mlflow": MlflowInterface(
+                experiment_name=experiment_name,
+                continue_run=False,
+                tracking_uri="sqlite:///:memory:",
+                cloud_interface=cloud_interface,
+            ),
+            "cloud_interface": cloud_interface,
+        }
+    ) as context:
+        basic_10k_validate(context)
+    run = _get_most_recent_run(experiment_name)
+
     assert run.data.metrics["precision"] == 1
     assert run.data.metrics["recall"] == 1

From ee77e7a975da0030867538b9db685bae54f07863 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Tue, 27 Aug 2024 13:56:05 -0400
Subject: [PATCH 003/161] Generalize mltools

---
 src/mozilla_sec_eia/assets.py                 |  74 +--
 src/mozilla_sec_eia/basic_10k.py              |  30 +-
 src/mozilla_sec_eia/cli.py                    |  90 ----
 src/mozilla_sec_eia/extract.py                | 461 ++++--------------
 src/mozilla_sec_eia/utils/cloud.py            |  52 +-
 .../utils/ml_tools/__init__.py                |  13 +
 .../utils/ml_tools/experiment_tracking.py     | 199 ++++++++
 src/mozilla_sec_eia/utils/ml_tools/models.py  | 162 ++++++
 8 files changed, 479 insertions(+), 602 deletions(-)
 create mode 100644 src/mozilla_sec_eia/utils/ml_tools/__init__.py
 create mode 100644 src/mozilla_sec_eia/utils/ml_tools/experiment_tracking.py
 create mode 100644 src/mozilla_sec_eia/utils/ml_tools/models.py

diff --git a/src/mozilla_sec_eia/assets.py b/src/mozilla_sec_eia/assets.py
index 8efd199..748690d 100644
--- a/src/mozilla_sec_eia/assets.py
+++ b/src/mozilla_sec_eia/assets.py
@@ -3,81 +3,15 @@
 import logging
 
 import coloredlogs
-from dagster import Definitions, EnvVar, define_asset_job
+from dagster import Definitions
 
-from mozilla_sec_eia.ex_21.train_extractor import train_model
-from mozilla_sec_eia.extract import (
-    ExtractConfig,
-    basic_10k_extract,
-    basic_10k_validate,
-    ex21_extract,
-    ex21_validate,
-)
-from mozilla_sec_eia.utils.cloud import GCSArchive, MlflowInterface
+from mozilla_sec_eia.utils import ml_tools
 
 logger = logging.getLogger("catalystcoop")
 log_format = "%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s"
 coloredlogs.install(fmt=log_format, logger=logger)
 
-extract_job = define_asset_job(
-    name="extract_job",
-    selection=[basic_10k_extract, ex21_extract],
-)
-validate_job = define_asset_job(
-    name="validate_job",
-    selection=[basic_10k_validate, ex21_validate],
-)
-finetune_model_job = define_asset_job(
-    name="finetune_model_job",
-    selection=[train_model],
-)
-
-cloud_interface = GCSArchive(
-    filings_bucket_name=EnvVar("GCS_FILINGS_BUCKET_NAME"),
-    labels_bucket_name=EnvVar("GCS_LABELS_BUCKET_NAME"),
-    metadata_db_instance_connection=EnvVar("GCS_METADATA_DB_INSTANCE_CONNECTION"),
-    user=EnvVar("GCS_IAM_USER"),
-    metadata_db_name=EnvVar("GCS_METADATA_DB_NAME"),
-    project=EnvVar("GCS_PROJECT"),
-)
-
 defs = Definitions(
-    assets=[
-        basic_10k_validate,
-        basic_10k_extract,
-        ex21_validate,
-        ex21_extract,
-        train_model,
-    ],
-    jobs=[extract_job, validate_job, finetune_model_job],
-    resources={
-        "cloud_interface": cloud_interface,
-        "basic_10k_extract_config": ExtractConfig(),
-        "ex21_extract_config": ExtractConfig(),
-        "basic_10k_extract_mlflow": MlflowInterface(
-            experiment_name="basic_10k_extraction",
-            tracking_uri=EnvVar("MLFLOW_TRACKING_URI"),
-            cloud_interface=cloud_interface,
-        ),
-        "ex21_extract_mlflow": MlflowInterface(
-            experiment_name="basic_10k_extraction",
-            tracking_uri=EnvVar("MLFLOW_TRACKING_URI"),
-            cloud_interface=cloud_interface,
-        ),
-        "basic_10k_extract_validate_mlflow": MlflowInterface(
-            experiment_name="basic_10k_extraction_validation",
-            tracking_uri=EnvVar("MLFLOW_TRACKING_URI"),
-            cloud_interface=cloud_interface,
-        ),
-        "ex21_extract_validate_mlflow": MlflowInterface(
-            experiment_name="basic_10k_extraction_validation",
-            tracking_uri=EnvVar("MLFLOW_TRACKING_URI"),
-            cloud_interface=cloud_interface,
-        ),
-        "layoutlm_mlflow_interface": MlflowInterface(
-            experiment_name="/finetune-layoutlmv3",
-            tracking_uri=EnvVar("MLFLOW_TRACKING_URI"),
-            cloud_interface=cloud_interface,
-        ),
-    },
+    jobs=ml_tools.get_ml_model_jobs(),
+    resources=ml_tools.get_ml_model_resources(),
 )
diff --git a/src/mozilla_sec_eia/basic_10k.py b/src/mozilla_sec_eia/basic_10k.py
index 9e0786e..cf04ada 100644
--- a/src/mozilla_sec_eia/basic_10k.py
+++ b/src/mozilla_sec_eia/basic_10k.py
@@ -1,9 +1,9 @@
 """Implement functions for handling data from basic 10k filings (not exhibit 21)."""
 
 import logging
-from concurrent.futures import ProcessPoolExecutor
 
 import pandas as pd
+from dagster import Out, op
 
 from mozilla_sec_eia.utils.cloud import GCSArchive, Sec10K
 
@@ -67,11 +67,10 @@ def _extract_10k(filing: Sec10K):
     return pd.DataFrame(values), filing.filename, unmatched_keys
 
 
+@op(out={"extraction_metadata": Out(), "extracted": Out()})
 def extract(
+    cloud_interface: GCSArchive,
     filings_to_extract: pd.DataFrame,
-    extraction_metadata: pd.DataFrame,
-    extracted: pd.DataFrame,
-    archive: GCSArchive,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Extract basic 10K data and write to postgres table.
 
@@ -81,15 +80,20 @@ def extract(
     """
     logger.info("Starting basic 10K extraction.")
     logger.info(f"Extracting {len(filings_to_extract)} filings.")
-    with ProcessPoolExecutor() as executor:
-        for ext, filename, unmatched_keys in executor.map(
-            _extract_10k, archive.iterate_filings(filings_to_extract)
-        ):
-            extraction_metadata.loc[filename, ["success", "unmatched_keys"]] = [
-                len(ext) > 0,
-                ",".join(unmatched_keys),
-            ]
-            extracted = pd.concat([extracted, ext])
+
+    extraction_metadata = pd.DataFrame(
+        {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)}
+    ).set_index("filename")
+    extracted = pd.DataFrame()
+
+    for filing in cloud_interface.iterate_filings(filings_to_extract):
+        ext, filename, unmatched_keys = _extract_10k(filing)
+        extraction_metadata.loc[filename, ["success", "unmatched_keys"]] = [
+            len(ext) > 0,
+            ",".join(unmatched_keys),
+        ]
+        extracted = pd.concat([extracted, ext])
+
     return (
         extraction_metadata,
         extracted.set_index(["filename", "filer_count", "block", "block_count", "key"]),
diff --git a/src/mozilla_sec_eia/cli.py b/src/mozilla_sec_eia/cli.py
index 5b38a0a..2d4a84a 100755
--- a/src/mozilla_sec_eia/cli.py
+++ b/src/mozilla_sec_eia/cli.py
@@ -4,93 +4,3 @@
 to add new scripts which can be accessed through one top-level
 interface.
 """
-
-import argparse
-import logging
-import sys
-from pathlib import Path
-
-import coloredlogs
-
-from mozilla_sec_eia.ex_21.create_labeled_dataset import (
-    create_inputs_for_label_studio,
-)
-from mozilla_sec_eia.ex_21.rename_labeled_filings import rename_filings
-from mozilla_sec_eia.ex_21.train_extractor import train_model
-from mozilla_sec_eia.extract import extract_filings, validate_extraction
-from mozilla_sec_eia.utils import GCSArchive
-
-# This is the module-level logger, for any logs
-logger = logging.getLogger(__name__)
-ROOT_DIR = Path(__file__).parent.parent.parent.resolve()
-
-
-def parse_command_line(argv: list[str]) -> argparse.Namespace:
-    """Parse command line arguments. See the -h option for details.
-
-    Args:
-        argv (str): Command line arguments, including caller filename.
-
-    Returns:
-        dict: Dictionary of command line arguments and their parsed values.
-
-    """
-
-    def formatter(prog) -> argparse.HelpFormatter:
-        """This is a hack to create HelpFormatter with a particular width."""
-        return argparse.HelpFormatter(prog, width=88)
-
-    # Use the module-level docstring as the script's description in the help message.
-    parser = argparse.ArgumentParser(description=__doc__, formatter_class=formatter)
-    subparsers = parser.add_subparsers(required=True)
-
-    # Add command to validate filing archive contents
-    validate_parser = subparsers.add_parser("validate_archive")
-    validate_parser.set_defaults(func=lambda: GCSArchive().validate_archive())
-
-    # Add command to fine-tune ex21 extractor
-    validate_parser = subparsers.add_parser("finetune_ex21")
-    validate_parser.add_argument("--labeled-json-path")
-    validate_parser.add_argument("--gcs-training-data-dir", default="labeled/")
-    validate_parser.add_argument("--model-output-dir", default="layoutlm_trainer")
-    validate_parser.add_argument("--test-size", default=0.2)
-    validate_parser.set_defaults(func=train_model)
-
-    # Add command to rename labeled filings on GCS
-    validate_parser = subparsers.add_parser("rename_filings")
-    validate_parser.set_defaults(func=rename_filings)
-
-    # Add command to extract basic 10k data
-    extract_parser = subparsers.add_parser("extract")
-    extract_parser.add_argument("--dataset", nargs=1, default="basic_10k")
-    extract_parser.add_argument("--continue-run", action="store_true", default=False)
-    extract_parser.add_argument("--num-filings", default=-1, nargs="?", type=int)
-    extract_parser.set_defaults(func=extract_filings)
-
-    validate_extract_parser = subparsers.add_parser("validate")
-    validate_extract_parser.add_argument("--dataset", nargs=1, default="basic_10k")
-    validate_extract_parser.set_defaults(func=validate_extraction)
-
-    # Add command to create Label Studio inputs from cached Ex. 21 images and PDFs
-    validate_parser = subparsers.add_parser("create_ls_inputs")
-    validate_parser.add_argument("--pdfs-dir", default=ROOT_DIR / "sec10k_filings/pdfs")
-    validate_parser.add_argument("--cache-dir", default=ROOT_DIR / "sec10k_filings")
-    validate_parser.set_defaults(func=create_inputs_for_label_studio)
-
-    arguments = parser.parse_args(argv[1:])
-
-    return arguments
-
-
-def main() -> int:
-    """Demonstrate a really basic command line interface (CLI) that takes arguments."""
-    logger = logging.getLogger("catalystcoop")
-    log_format = "%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s"
-    coloredlogs.install(fmt=log_format, logger=logger)
-
-    args = parse_command_line(sys.argv)
-    return args.func(**{key: val for key, val in vars(args).items() if key != "func"})
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/src/mozilla_sec_eia/extract.py b/src/mozilla_sec_eia/extract.py
index 44d598f..90d87ba 100644
--- a/src/mozilla_sec_eia/extract.py
+++ b/src/mozilla_sec_eia/extract.py
@@ -2,25 +2,31 @@
 
 import io
 import logging
+import math
 import tempfile
-from importlib import resources
 from pathlib import Path
 
 import mlflow
+import numpy as np
 import pandas as pd
 import pandera as pa
-from dagster import ConfigurableResource, asset
+from dagster import (
+    Config,
+    DynamicOut,
+    DynamicOutput,
+    GraphDefinition,
+    OpDefinition,
+    graph,
+    op,
+)
 from mlflow.entities import Run
 
 from mozilla_sec_eia import basic_10k
-from mozilla_sec_eia.ex_21.inference import clean_extracted_df, perform_inference
-from mozilla_sec_eia.utils.cloud import (
-    GCSArchive,
-    get_metadata_filename,
-)
-from mozilla_sec_eia.utils.layoutlm import (
-    load_model,
+from mozilla_sec_eia.utils.cloud import GCSArchive
+from mozilla_sec_eia.utils.ml_tools.experiment_tracking import (
+    get_tracking_resource_name,
 )
+from mozilla_sec_eia.utils.ml_tools.models import pudl_model
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
@@ -68,61 +74,81 @@ def _log_artifact_as_parquet(
         return mlflow.log_artifact(parquet_path, artifact_name)
 
 
-def _get_most_recent_run(experiment_name: str):
-    """Search mlflow for most recent extraction run with specified experiment name."""
-    run_metadata = mlflow.search_runs(experiment_names=[experiment_name])
+@op
+def get_filings_to_extract(
+    cloud_interface: GCSArchive,
+) -> pd.DataFrame:
+    """Return filing metadata."""
+    return cloud_interface.get_metadata()
+
+
+class ChunkFilingsConfig(Config):
+    """Configure how many filings to extract and the chunk size for extraction."""
 
-    # Mlflow returns runs ordered by their runtime, so it's easy to grab the latest run
-    # This assert will ensure this doesn't silently break if the ordering changes
-    assert run_metadata.loc[0, "end_time"] == run_metadata["end_time"].max()
-    return mlflow.get_run(run_metadata.loc[0, "run_id"])
+    chunk_size: int = 1000
+    num_filings: int = -1
 
 
-def _get_filings_to_extract(
-    experiment_name: str,
+@op(out=DynamicOut())
+def chunk_filings(
+    config: ChunkFilingsConfig,
     metadata: pd.DataFrame,
-    continue_run: bool = False,
-    num_filings: int = -1,
-):
-    """Get filings that should be extracted by run.
+) -> pd.DataFrame:
+    """Split filings into chunks for parallel extraction."""
+    filings_to_extract = metadata
+    if config.num_filings > 0:
+        filings_to_extract = filings_to_extract.sample(config.num_filings)
+
+    for i, chunk in enumerate(
+        np.array_split(
+            filings_to_extract, math.ceil(len(filings_to_extract) / config.chunk_size)
+        )
+    ):
+        yield DynamicOutput(chunk, mapping_key=str(i))
 
-    Args:
-        experiment_name: Name of mlflow experiment.
-        metadata: Metadata for full set of filings to potentially extract.
-        continue_run: Whether to continue a previous extraction run.
-        num_filings: Number of filings to extract.
-    """
-    extraction_metadata = pd.DataFrame(
-        {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)}
-    ).set_index("filename")
-    extracted = pd.DataFrame()
-    run_id = None
-    if continue_run:
-        most_recent_run = _get_most_recent_run(experiment_name)
-        extraction_metadata = ExtractionMetadataSchema.validate(
-            _load_artifact_as_csv(
-                most_recent_run, "/extraction_metadata.csv"
-            ).set_index("filename")
+
+def extract_model_factory(
+    dataset_name: str, extract_op: OpDefinition | GraphDefinition
+):
+    """Produce a `pudl_model` to extract data from sec10k filings."""
+    experiment_name = f"{dataset_name}_extraction"
+    experiment_tracker_resource = get_tracking_resource_name(experiment_name)
+
+    @op(required_resource_keys=[experiment_tracker_resource])
+    def log_extraction_data(
+        metadata: pd.DataFrame,
+        extraction_metadata: list[pd.DataFrame],
+        extracted: list[pd.DataFrame],
+    ):
+        extraction_metadata = pd.concat(extraction_metadata)
+        extracted = pd.concat(extracted)
+        # Use metadata to log generic metrics
+        extraction_metadata = ExtractionMetadataSchema.validate(extraction_metadata)
+        mlflow.log_metrics(
+            {
+                "num_failed": (~extraction_metadata["success"]).sum(),
+                "ratio_extracted": len(extraction_metadata) / len(metadata),
+            }
         )
-        extracted = _load_artifact_as_parquet(most_recent_run, "/extracted.parquet")
-        run_id = most_recent_run.info.run_id
-
-    filings_to_extract = metadata[~metadata["filename"].isin(extraction_metadata.index)]
-    if num_filings > 0:
-        filings_to_extract = filings_to_extract.sample(num_filings)
-    return (
-        filings_to_extract,
-        extraction_metadata,
-        extracted,
-        run_id,
-    )
 
+        # Log the extraction results + metadata for future reference/analysis
+        _log_artifact_as_csv(extraction_metadata, "extraction_metadata.csv")
+        _log_artifact_as_parquet(extracted, "extracted.parquet")
+
+    @pudl_model(experiment_name=experiment_name)
+    @graph(name=experiment_name)
+    def extract_filings():
+        filings_to_extract = get_filings_to_extract()
+        filing_chunks = chunk_filings(filings_to_extract)
+        extraction_metadata, extracted = filing_chunks.map(extract_op)
+
+        return log_extraction_data(
+            filings_to_extract,
+            extraction_metadata.collect(),
+            extracted.collect(),
+        )
 
-def _get_experiment_name(dataset: str, experiment_suffix: str | None = None) -> str:
-    experiment_name = f"{dataset}_extraction"
-    if experiment_suffix is not None:
-        experiment_name += f"_{experiment_suffix}"
-    return experiment_name
+    return extract_filings
 
 
 def compute_validation_metrics(
@@ -165,325 +191,4 @@ def compute_validation_metrics(
     }
 
 
-def extract_filings(
-    dataset: str,
-    filings_to_extract: pd.DataFrame,
-    extraction_metadata: pd.DataFrame,
-    extracted: pd.DataFrame,
-    num_filings: int,
-    experiment_name: str,
-    cloud_interface: GCSArchive,
-    run_id: str | None = None,
-) -> pd.DataFrame:
-    """Extract filings in `filings_to_extract`."""
-    mlflow.set_experiment(experiment_name)
-    with mlflow.start_run(run_id=run_id):
-        # Extract data for desired filings
-        if dataset == "basic_10k":
-            extraction_metadata, extracted = basic_10k.extract(
-                filings_to_extract,
-                extraction_metadata,
-                extracted,
-                cloud_interface,
-            )
-        else:
-            model_checkpoint = load_model()
-            model = model_checkpoint["model"]
-            processor = model_checkpoint["tokenizer"]
-            # populate extraction metadata with filenames
-            # TODO: does extraction md already have filenames in it? check this
-            extraction_metadata = pd.concat(
-                [
-                    extraction_metadata,
-                    pd.DataFrame(
-                        {
-                            "filename": filings_to_extract["filename"].unique(),
-                            "success": False,
-                        }
-                    ).set_index("filename"),
-                ]
-            )
-            # TODO: there's probably a faster way to do this with less caching
-            with tempfile.TemporaryDirectory() as temp_dir:
-                # get Sec10K objects
-                # TODO: does it save time if we don't cache them?
-                temp_dir = Path(temp_dir)
-                cloud_interface.get_filings(
-                    filings_to_extract, cache_directory=temp_dir, cache_pdf=True
-                )
-                _, _, extracted, extraction_metadata = perform_inference(
-                    pdfs_dir=temp_dir,
-                    model=model,
-                    processor=processor,
-                    extraction_metadata=extraction_metadata,
-                )
-            extracted["filename"] = extracted["id"].apply(get_metadata_filename)
-            extracted = extracted.set_index("filename")
-
-        # Use metadata to log generic metrics
-        extraction_metadata = ExtractionMetadataSchema.validate(extraction_metadata)
-        mlflow.log_metrics(
-            {
-                "num_failed": (~extraction_metadata["success"]).sum(),
-                "ratio_extracted": len(extraction_metadata) / num_filings,
-            }
-        )
-
-        # Log the extraction results + metadata for future reference/analysis
-        _log_artifact_as_csv(extraction_metadata, "extraction_metadata.csv")
-        _log_artifact_as_parquet(extracted, "extracted.parquet")
-    logger.info(
-        f"Finished extracting {len(extraction_metadata)} filings from {dataset}."
-    )
-    return extracted
-
-
-class ExtractConfig(ConfigurableResource):
-    """Basic configuration for an extraction run."""
-
-    num_filings: int = -1
-
-
-def extract_asset_factory(dataset: str) -> asset:
-    """Produce asset to extract `dataset`."""
-
-    @asset(
-        name=f"{dataset}_extract",
-        required_resource_keys={
-            f"{dataset}_extract_config",
-            f"{dataset}_extract_mlflow",
-            "cloud_interface",
-        },
-    )
-    def extract(context) -> pd.DataFrame:
-        config = context.resources.original_resource_dict[f"{dataset}_extract_config"]
-        cloud_interface: GCSArchive = context.resources.cloud_interface
-        mlflow_interface = context.resources.original_resource_dict[
-            f"{dataset}_extract_mlflow"
-        ]
-        experiment_name = mlflow_interface.experiment_name
-        metadata = cloud_interface.get_metadata()
-
-        # Get filings to extract as well as any existing metadata for run
-        filings_to_extract, extraction_metadata, extracted, run_id = (
-            _get_filings_to_extract(
-                experiment_name,
-                metadata,
-                continue_run=mlflow_interface.continue_run,
-                num_filings=config.num_filings,
-            )
-        )
-
-        return extract_filings(
-            dataset=dataset,
-            filings_to_extract=filings_to_extract,
-            extraction_metadata=extraction_metadata,
-            extracted=extracted,
-            num_filings=len(metadata),
-            experiment_name=experiment_name,
-            cloud_interface=cloud_interface,
-            run_id=run_id,
-        )
-
-    return extract
-
-
-def jaccard_similarity(
-    computed_df: pd.DataFrame, validation_df: pd.DataFrame, value_col: str
-) -> float:
-    """Get the Jaccard similarity between two Series.
-
-    Calculated as the intersection of the set divided
-    by the union of the set.
-
-    Args:
-        computed_df: Extracted data.
-        validation_df: Expected extraction results.
-        value_col: Column to calculate Jaccard similarity on.
-            Must be present in both dataframes.
-    """
-    # fill nans to make similarity comparison more accurate
-    if (computed_df[value_col].dtype == float) and (
-        validation_df[value_col].dtype == float
-    ):
-        computed_df[value_col] = computed_df[value_col].fillna(999)
-        validation_df[value_col] = validation_df[value_col].fillna(999)
-    else:
-        computed_df[value_col] = computed_df[value_col].fillna("zzz")
-        validation_df[value_col] = validation_df[value_col].fillna("zzz")
-    intersection = set(computed_df[value_col]).intersection(
-        set(validation_df[value_col])
-    )
-    union = set(computed_df[value_col]).union(set(validation_df[value_col]))
-    return float(len(intersection)) / float(len(union))
-
-
-def compute_ex21_validation_metrics(
-    computed_df: pd.DataFrame, validation_df: pd.DataFrame
-):
-    """Compute validation metrics for Ex. 21 extraction."""
-    shared_cols = validation_df.columns.intersection(computed_df.columns)
-    validation_df = validation_df.astype(computed_df[shared_cols].dtypes)
-    n_equal = 0
-    validation_filenames = validation_df["id"].unique()
-    n_files = len(validation_filenames)
-    table_metrics_dict = {}
-    jaccard_dict = {}
-    incorrect_files = []
-    # iterate through each file and check each extracted table
-    for filename in validation_filenames:
-        extracted_table_df = computed_df[computed_df["id"] == filename].reset_index(
-            drop=True
-        )
-        validation_table_df = validation_df[
-            validation_df["id"] == filename
-        ].reset_index(drop=True)
-        # check if the tables are exactly equal
-        if extracted_table_df.equals(validation_table_df):
-            # TODO: strip llc and other company strings before comparison
-            n_equal += 1
-        else:
-            incorrect_files.append(filename)
-        # compute precision and recall for each column
-        table_metrics_dict[filename] = {}
-        jaccard_dict[filename] = {}
-        for col in ["subsidiary", "loc", "own_per"]:
-            table_prec_recall = compute_validation_metrics(
-                extracted_table_df, validation_table_df, value_col=col
-            )
-            table_metrics_dict[filename][f"{col}_precision"] = table_prec_recall[
-                "precision"
-            ]
-            table_metrics_dict[filename][f"{col}_recall"] = table_prec_recall["recall"]
-            # get the jaccard similarity between columns
-            jaccard_dict[filename][col] = jaccard_similarity(
-                computed_df=extracted_table_df,
-                validation_df=validation_table_df,
-                value_col=col,
-            )
-
-    jaccard_df = pd.DataFrame.from_dict(jaccard_dict, orient="index").reset_index()
-    prec_recall_df = pd.DataFrame.from_dict(
-        table_metrics_dict, orient="index"
-    ).reset_index()
-    _log_artifact_as_csv(
-        jaccard_df,
-        artifact_name="jaccard_per_table.csv",
-    )
-    _log_artifact_as_csv(
-        prec_recall_df,
-        artifact_name="precision_recall_per_table.csv",
-    )
-    _log_artifact_as_csv(
-        pd.DataFrame({"filename": incorrect_files}),
-        artifact_name="incorrect_filenames.csv",
-    )
-    return {
-        "table_accuracy": n_equal / n_files,
-        "avg_subsidiary_jaccard_sim": jaccard_df["subsidiary"].sum() / n_files,
-        "avg_location_jaccard_sim": jaccard_df["loc"].sum() / n_files,
-        "avg_own_per_jaccard_sim": jaccard_df["own_per"].sum() / n_files,
-        "avg_subsidiary_precision": prec_recall_df["subsidiary_precision"].sum()
-        / n_files,
-        "avg_location_precision": prec_recall_df["loc_precision"].sum() / n_files,
-        "avg_own_per_precision": prec_recall_df["own_per_precision"].sum() / n_files,
-        "avg_subsidiary_recall": prec_recall_df["subsidiary_recall"].sum() / n_files,
-        "avg_location_recall": prec_recall_df["loc_recall"].sum() / n_files,
-        "avg_own_per_recall": prec_recall_df["own_per_recall"].sum() / n_files,
-    }
-
-
-def clean_ex21_validation_set(validation_df: pd.DataFrame):
-    """Clean Ex. 21 validation data to match extracted format."""
-    validation_df = validation_df.rename(
-        columns={
-            "Filename": "id",
-            "Subsidiary": "subsidiary",
-            "Location of Incorporation": "loc",
-            "Ownership Percentage": "own_per",
-        }
-    )
-    validation_df["own_per"] = validation_df["own_per"].astype(str)
-    validation_df["filename"] = validation_df["id"].apply(get_metadata_filename)
-    validation_df = clean_extracted_df(validation_df)
-    return validation_df
-
-
-def validate_extraction(
-    dataset: str, experiment_name: str, cloud_interface: GCSArchive
-):
-    """Run extraction on validation set and compare results to labeled data."""
-    validation_set = pd.read_csv(
-        resources.files("mozilla_sec_eia.package_data") / f"{dataset}_labels.csv"
-    )
-    if dataset == "ex21":
-        validation_set = clean_ex21_validation_set(validation_set)
-
-    # Get metadata for labelled filings
-    to_extract = cloud_interface.get_metadata(
-        filenames=list(validation_set["filename"])
-    )
-
-    # Get filings to extract as well as any existing metadata for run
-    filings_to_extract, extraction_metadata, extracted, run_id = (
-        _get_filings_to_extract(
-            experiment_name,
-            to_extract,
-        )
-    )
-
-    # Extract data from filings
-    extracted = extract_filings(
-        dataset=dataset,
-        filings_to_extract=filings_to_extract,
-        extraction_metadata=extraction_metadata,
-        extracted=extracted,
-        num_filings=len(to_extract),
-        experiment_name=experiment_name,
-        cloud_interface=cloud_interface,
-        run_id=run_id,
-    )
-
-    # Set index for validation set based on returned extracted DF
-    validation_set = validation_set.set_index(extracted.index.names)
-
-    # Get extraction run from mlflow and start again to log validation metrics
-    run = _get_most_recent_run(experiment_name)
-    with mlflow.start_run(run_id=run.info.run_id):
-        # Compute metrics and log
-        if dataset == "basic_10k":
-            mlflow.log_metrics(
-                compute_validation_metrics(extracted, validation_set, "value")
-            )
-        else:
-            mlflow.log_metrics(
-                compute_ex21_validation_metrics(extracted, validation_set)
-            )
-        # Log validation set used to compute metrics
-        _log_artifact_as_csv(validation_set, "labels.csv")
-
-
-def validate_extraction_asset_factory(dataset: str):
-    """Create asset that extracts validation filings and compute validation metrics."""
-
-    @asset(
-        name=f"{dataset}_extract_validate",
-        required_resource_keys={
-            f"{dataset}_extract_validate_mlflow",
-            "cloud_interface",
-        },
-    )
-    def validate(context):
-        cloud_interface: GCSArchive = context.resources.cloud_interface
-        experiment_name = context.resources.original_resource_dict[
-            f"{dataset}_extract_validate_mlflow"
-        ].experiment_name
-        return validate_extraction(dataset, experiment_name, cloud_interface)
-
-    return validate
-
-
-basic_10k_extract = extract_asset_factory("basic_10k")
-basic_10k_validate = validate_extraction_asset_factory("basic_10k")
-ex21_extract = extract_asset_factory("ex21")
-ex21_validate = validate_extraction_asset_factory("ex21")
+basic_10k_extract = extract_model_factory("basic_10k", basic_10k.extract)
diff --git a/src/mozilla_sec_eia/utils/cloud.py b/src/mozilla_sec_eia/utils/cloud.py
index 9248140..076a5de 100644
--- a/src/mozilla_sec_eia/utils/cloud.py
+++ b/src/mozilla_sec_eia/utils/cloud.py
@@ -3,7 +3,6 @@
 import base64
 import io
 import logging
-import os
 import re
 from contextlib import contextmanager
 from hashlib import md5
@@ -11,11 +10,10 @@
 from typing import BinaryIO, TextIO
 
 import fitz
-import mlflow
 import pandas as pd
 import pg8000
 from dagster import ConfigurableResource
-from google.cloud import secretmanager, storage
+from google.cloud import storage
 from google.cloud.sql.connector import Connector
 from PIL import Image
 from pydantic import BaseModel, PrivateAttr
@@ -400,51 +398,3 @@ def validate_archive(self) -> bool:
 def get_metadata_filename(local_filename: str):
     """Transform a local filename into the filename in GCSArchiver metadata."""
     return "edgar/data/" + local_filename.replace("-", "/", 1) + ".txt"
-
-
-def _access_secret_version(secret_id: str, project_id: str, version_id="latest"):
-    # Create the Secret Manager client.
-    client = secretmanager.SecretManagerServiceClient()
-
-    # Build the resource name of the secret version.
-    name = f"projects/{project_id}/secrets/{secret_id}/versions/{version_id}"
-
-    # Access the secret version.
-    response = client.access_secret_version(name=name)
-
-    # Return the decoded payload.
-    return response.payload.data.decode("UTF-8")
-
-
-class MlflowInterface(ConfigurableResource):
-    """Initialize interface to mlflow for desired experiment."""
-
-    experiment_name: str
-    continue_run: bool = False
-    tracking_uri: str
-    cloud_interface: GCSArchive
-    artifact_location: str | None = None
-
-    def setup_for_execution(self, context):
-        """Do runtime configuration of mlflow."""
-        os.environ["MLFLOW_TRACKING_USERNAME"] = "admin"
-        os.environ["MLFLOW_TRACKING_PASSWORD"] = _access_secret_version(
-            "mlflow_admin_password", self.cloud_interface.project
-        )
-        os.environ["MLFLOW_TRACKING_URI"] = self.tracking_uri
-        os.environ["MLFLOW_GCS_DOWNLOAD_CHUNK_SIZE"] = "20971520"
-        os.environ["MLFLOW_GCS_UPLOAD_CHUNK_SIZE"] = "20971520"
-        os.environ["MLFLOW_HTTP_REQUEST_TIMEOUT"] = "900"
-        os.environ["MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING"] = "true"
-        logger.info(f"Initialized tracking with mlflow server: {self.tracking_uri}")
-
-        self.create_experiment()
-
-    def create_experiment(self):
-        """Create experiment if it doesn't already exist."""
-        logger.info(f"Creating experiment: {self.experiment_name}")
-        if not mlflow.get_experiment_by_name(self.experiment_name):
-            mlflow.create_experiment(
-                name=self.experiment_name,
-                artifact_location=self.artifact_location,
-            )
diff --git a/src/mozilla_sec_eia/utils/ml_tools/__init__.py b/src/mozilla_sec_eia/utils/ml_tools/__init__.py
new file mode 100644
index 0000000..f3448ee
--- /dev/null
+++ b/src/mozilla_sec_eia/utils/ml_tools/__init__.py
@@ -0,0 +1,13 @@
+"""Implements shared tooling for machine learning models in PUDL."""
+
+from . import models
+
+
+def get_ml_model_resources():
+    """Return default configuration for all PUDL models."""
+    return models.MODEL_RESOURCES
+
+
+def get_ml_model_jobs() -> list:
+    """Return the job objects (not names) registered by the `pudl_model` decorator."""
+    return list(models.PUDL_MODELS.values())
diff --git a/src/mozilla_sec_eia/utils/ml_tools/experiment_tracking.py b/src/mozilla_sec_eia/utils/ml_tools/experiment_tracking.py
new file mode 100644
index 0000000..394b8b1
--- /dev/null
+++ b/src/mozilla_sec_eia/utils/ml_tools/experiment_tracking.py
@@ -0,0 +1,199 @@
+"""This module implements experiment tracking tooling using mlflow as a backend.
+
+:class:`ExperimentTracker` is a dagster ``ConfigurableResource`` which can be required
+by the op's which make up a PUDL model. When initialized, it resumes (or starts) the
+mlflow run tagged with the current dagster run ID, so all parameters and metrics are logged to the appropriate
+mlflow run. The following command will launch the mlflow UI to view model results:
+`mlflow ui --backend-store-uri {tracking_uri}`. `tracking_uri` by default will point
+to a file named 'experiments.sqlite' in the base directory of your PUDL repo, but
+this is a configurable value, which can be found in the dagster UI.
+"""
+
+import atexit
+import logging
+import os
+from contextlib import contextmanager
+
+import mlflow
+from dagster import ConfigurableResource, InitResourceContext, op
+from google.cloud import secretmanager
+
+logger = logging.getLogger(f"catalystcoop.{__name__}")
+
+
+def _flatten_model_config(model_config: dict) -> dict:
+    """Take nested dictionary defining model config and flatten for logging purposes.
+
+    This is essentially a translation layer between Dagster configuration and mlflow,
+    which does not support displaying nested parameters in the UI.
+
+    Examples:
+        >>> _flatten_model_config(
+        ...     {
+        ...         'ferc_to_ferc': {
+        ...             'link_ids_cross_year': {
+        ...                 'compute_distance_matrix': {
+        ...                     'distance_threshold': .5,
+        ...                      'metric': 'euclidean',
+        ...                 },
+        ...                 'match_orphaned_records': {'distance_threshold': 0.5},
+        ...             }
+        ...         }
+        ...     }
+        ... ) == {
+        ...     'ferc_to_ferc.link_ids_cross_year.compute_distance_matrix.distance_threshold': 0.5,
+        ...     'ferc_to_ferc.link_ids_cross_year.compute_distance_matrix.metric': 'euclidean',
+        ...     'ferc_to_ferc.link_ids_cross_year.match_orphaned_records.distance_threshold': 0.5
+        ... }
+        True
+    """
+
+    def _flatten_level(config_level: dict, param_name: str):
+        flattened_dict = {}
+        for key, val in config_level.items():
+            flattened_param = f"{param_name}.{key}"
+            if isinstance(val, dict):
+                flattened_dict |= _flatten_level(val, param_name=flattened_param)
+            else:
+                flattened_dict[flattened_param[1:]] = val
+        return flattened_dict
+
+    return _flatten_level(model_config, "")
+
+
+class ExperimentTracker(ConfigurableResource):
+    """Class to manage tracking a machine learning model using MLflow.
+
+    The following command will launch the mlflow UI to view model results:
+    `mlflow ui --backend-store-uri {tracking_uri}`. From here, you can compare metrics
+    from multiple runs, and track performance.
+
+    This class is designed to be created using the `op` :func:`create_experiment_tracker`.
+    This allows the `ExperimentTracker` to be passed around within a Dagster `graph`,
+    and be used for mlflow logging in any of the `op`'s that make up the `graph`. This
+    is useful because Dagster executes `op`'s in separate processes, while mlflow does
+    not maintain state between processes. This design also allows configuration of
+    the ExperimentTracker to be set from the Dagster UI.
+
+    Currently, we are only doing experiment tracking in a local context, but if we were
+    to setup a tracking server, we could point the `tracking_uri` at this remote server
+    without having to modify the models. Experiment tracking can also be done outside
+    of the PUDL context. If doing exploratory work in a notebook, you can use mlflow
+    directly in a notebook with the same experiment name used here, and mlflow will
+    seamlessly integrate the results with those from PUDL runs.
+    """
+
+    tracking_uri: str
+    tracking_enabled: bool = True
+    artifact_location: str | None = None
+    experiment_name: str
+    tags: dict = {}
+    project: str
+
+    @contextmanager
+    def yield_for_execution(
+        self,
+        context: InitResourceContext,
+    ) -> "ExperimentTracker":
+        """Create experiment tracker for specified experiment."""
+        if self.tracking_enabled:
+            self._configure_mlflow()
+
+            # Get run_id associated with current dagster run
+            experiment_id = self.get_or_create_experiment(
+                experiment_name=self.experiment_name,
+                artifact_location=self.artifact_location,
+            )
+            mlflow_run_id = self._get_mlflow_run_id(context.run_id, experiment_id)
+
+            # Hack to stop mlflow from ending run at process barrier
+            # This is borrowed from the official dagster mlflow resource found here:
+            # https://github.com/dagster-io/dagster/blob/master/python_modules/libraries/dagster-mlflow/dagster_mlflow/resources.py
+            atexit.unregister(mlflow.end_run)
+
+            # Create new run under specified experiment
+            with mlflow.start_run(
+                run_id=mlflow_run_id,
+                experiment_id=experiment_id,
+                tags=self.tags | {"dagster_run_id": context.run_id},
+            ):
+                yield self
+
+    def _get_tracking_password(self, version_id: str = "latest"):
+        """Get tracking server password from gcloud secrets."""
+        # Create the Secret Manager client.
+        client = secretmanager.SecretManagerServiceClient()
+
+        # Build the resource name of the secret version.
+        name = f"projects/{self.project}/secrets/mlflow_admin_password/versions/{version_id}"
+
+        # Access the secret version.
+        response = client.access_secret_version(name=name)
+
+        # Return the decoded payload.
+        return response.payload.data.decode("UTF-8")
+
+    def _configure_mlflow(self):
+        """Do runtime configuration of mlflow."""
+        os.environ["MLFLOW_TRACKING_USERNAME"] = "admin"
+        os.environ["MLFLOW_TRACKING_PASSWORD"] = self._get_tracking_password()
+        os.environ["MLFLOW_TRACKING_URI"] = self.tracking_uri
+        os.environ["MLFLOW_GCS_DOWNLOAD_CHUNK_SIZE"] = "20971520"
+        os.environ["MLFLOW_GCS_UPLOAD_CHUNK_SIZE"] = "20971520"
+        os.environ["MLFLOW_HTTP_REQUEST_TIMEOUT"] = "900"
+        os.environ["MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING"] = "true"
+
+    def _get_mlflow_run_id(self, dagster_run_id: str, experiment_id: str):
+        """Search for existing run tagged with dagster run id or start new run."""
+        run_df = mlflow.search_runs(
+            experiment_ids=[experiment_id],
+            filter_string=f"tags.dagster_run_id='{dagster_run_id}'",
+        )
+
+        run_id = None
+        if not run_df.empty:
+            run_id = run_df.loc[0, "run_id"]
+        return run_id
+
+    @staticmethod
+    def get_or_create_experiment(
+        experiment_name: str, artifact_location: str = ""
+    ) -> str:
+        """Retrieve the ID of an existing MLflow experiment or create a new one if it doesn't exist.
+
+        This function checks if an experiment with the given name exists within MLflow.
+        If it does, the function returns its ID. If not, it creates a new experiment
+        with the provided name and returns its ID.
+
+        Returns:
+            ID of the existing or newly created MLflow experiment.
+        """
+        if experiment := mlflow.get_experiment_by_name(experiment_name):
+            experiment_id = experiment.experiment_id
+        else:
+            experiment_id = mlflow.create_experiment(
+                experiment_name, artifact_location=artifact_location
+            )
+
+        return experiment_id
+
+
+def get_tracking_resource_name(experiment_name: str):
+    """Return expected name of experiment tracking resource given experiment name."""
+    return f"{experiment_name}_tracker"
+
+
+def experiment_tracker_teardown_factory(
+    experiment_name: str,
+) -> ExperimentTracker:
+    """Use config to create an experiment tracker."""
+    atexit.unregister(mlflow.end_run)
+
+    @op(
+        name=f"{experiment_name}_tracker_teardown",
+        required_resource_keys=[f"{experiment_name}_tracker"],
+    )
+    def teardown_experiment_tracker(_results):
+        mlflow.end_run()
+
+    return teardown_experiment_tracker
diff --git a/src/mozilla_sec_eia/utils/ml_tools/models.py b/src/mozilla_sec_eia/utils/ml_tools/models.py
new file mode 100644
index 0000000..606bdc7
--- /dev/null
+++ b/src/mozilla_sec_eia/utils/ml_tools/models.py
@@ -0,0 +1,162 @@
+"""Provides tooling for developing/tracking ml models within PUDL.
+
+The main interface from this module is the :func:`pudl_model` decorator, which
+is meant to be applied to a dagster `graph`. This decorator will handle finding all
+configuration for a model/passing configuration to dagster, creating an
+:class:`ExperimentTracker` for the model, and ultimately will return a `job`
+from the model.
+
+There are a few different ways to provide configuration for a PUDL model. First, configuration will come from default values for any dagster `Config`'s which are associated
+with `op`'s which make up the model `graph`. For more info on dagster configuration,
+see https://docs.dagster.io/concepts/configuration/config-schema. The next way to
+provide configuration is through the yaml file: `pudl.package_data.settings.pudl_models.yml`.
+Any configuration in this file should follow dagster's config-schema formatting,
+see the `ferc_to_ferc` entry as an example. Configuration provided this way will
+override any default values. The final way to provide configuration is through the
+dagster UI. To provide configuration this way, click `Open Launchpad` in the UI, and
+values can be edited here. This configuration will override both default values and
+yaml configuration, but will only be used for a single run.
+"""
+
+import importlib
+import logging
+
+import mlflow
+import yaml
+from dagster import (
+    EnvVar,
+    GraphDefinition,
+    HookContext,
+    JobDefinition,
+    OpDefinition,
+    RunConfig,
+    job,
+    op,
+    success_hook,
+)
+
+from mozilla_sec_eia.utils import GCSArchive
+
+from .experiment_tracking import (
+    ExperimentTracker,
+    experiment_tracker_teardown_factory,
+    get_tracking_resource_name,
+)
+
+logger = logging.getLogger(f"catalystcoop.{__name__}")
+MODEL_RESOURCES = {}
+PUDL_MODELS = {}
+
+
+def get_yml_config(experiment_name: str) -> dict:
+    """Load model configuration from yaml file."""
+    config_file = (
+        importlib.resources.files("pudl.package_data.settings") / "pudl_models.yml"
+    )
+    config = yaml.safe_load(config_file.open("r"))
+
+    if not (model_config := config.get(experiment_name)):
+        raise RuntimeError(f"No {experiment_name} entry in {config_file}")
+
+    return {experiment_name: model_config}
+
+
+def get_default_config(model_graph: GraphDefinition) -> dict:
+    """Get default config values for model."""
+
+    def _get_default_from_ops(node: OpDefinition | GraphDefinition):
+        config = {}
+        if isinstance(node, GraphDefinition):
+            config = {
+                "ops": {
+                    child_node.name: _get_default_from_ops(child_node)
+                    for child_node in node.node_defs
+                }
+            }
+        else:
+            if node.config_schema.default_provided:
+                config = {"config": node.config_schema.default_value}
+            else:
+                config = {"config": None}
+
+        return config
+
+    config = {model_graph.name: _get_default_from_ops(model_graph)}
+    return config
+
+
+def get_pudl_model_job_name(experiment_name: str) -> str:
+    """Return expected pudl model job name based on experiment_name."""
+    return f"{experiment_name}_job"
+
+
+def pudl_model(experiment_name: str, config_from_yaml: bool = False) -> JobDefinition:
+    """Decorator for an ML model that will handle providing configuration to dagster."""
+
+    def _decorator(model_graph: GraphDefinition):
+        model_config = get_default_config(model_graph)
+        if config_from_yaml:
+            model_config |= get_yml_config(model_graph.name)
+
+        # Add resources to resource dict
+        cloud_interface = GCSArchive(
+            filings_bucket_name=EnvVar("GCS_FILINGS_BUCKET_NAME"),
+            labels_bucket_name=EnvVar("GCS_LABELS_BUCKET_NAME"),
+            metadata_db_instance_connection=EnvVar(
+                "GCS_METADATA_DB_INSTANCE_CONNECTION"
+            ),
+            user=EnvVar("GCS_IAM_USER"),
+            metadata_db_name=EnvVar("GCS_METADATA_DB_NAME"),
+            project=EnvVar("GCS_PROJECT"),
+        )
+        MODEL_RESOURCES.update(
+            {
+                get_tracking_resource_name(experiment_name): ExperimentTracker(
+                    experiment_name=experiment_name,
+                    tracking_uri=EnvVar("MLFLOW_TRACKING_URI"),
+                    project=EnvVar("GCS_PROJECT"),
+                ),
+                "cloud_interface": cloud_interface,
+            }
+        )
+
+        default_config = RunConfig(
+            ops=model_config,
+        )
+
+        @op
+        def _collect_results(model_graph_output, _implicit_dependencies: list):
+            return model_graph_output
+
+        @success_hook(
+            required_resource_keys={get_tracking_resource_name(experiment_name)}
+        )
+        def _log_config_hook(context: HookContext):
+            if (config := context.op_config) is not None:
+                mlflow.log_params(
+                    {
+                        f"{context.op.name}.{param}": value
+                        for param, value in config.items()
+                    }
+                )
+
+        @job(
+            name=get_pudl_model_job_name(experiment_name),
+            config=default_config,
+            hooks={_log_config_hook},
+        )
+        def model_asset(**kwargs):
+            tracker_teardown = experiment_tracker_teardown_factory(
+                experiment_name=model_graph.name,
+            )
+            graph_output = model_graph(**kwargs)
+
+            # Pass output to teardown to create a dependency
+            teardown = tracker_teardown(graph_output)
+
+            _collect_results(graph_output, [teardown])
+
+        PUDL_MODELS[get_pudl_model_job_name(experiment_name)]
+        return model_asset
+
+    return _decorator

From 53d33545dc041463fe90926d03a5dc61e8e45e50 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 28 Aug 2024 09:44:57 -0400
Subject: [PATCH 004/161] Reorg repo to move towards generalized modelling repo

---
 src/mozilla_sec_eia/__init__.py               |  5 +-
 .../{utils => library}/ml_tools/__init__.py   |  0
 .../ml_tools/experiment_tracking.py           | 15 ++++
 .../{utils => library}/ml_tools/models.py     | 23 ++---
 .../{assets.py => model_jobs.py}              |  2 +-
 src/mozilla_sec_eia/models/__init__.py        |  3 +
 src/mozilla_sec_eia/models/sec10k/__init__.py |  1 +
 .../{ => models/sec10k}/basic_10k.py          |  2 +-
 .../{ => models/sec10k}/ex_21/__init__.py     |  0
 .../sec10k}/ex_21/create_labeled_dataset.py   |  6 +-
 .../{ => models/sec10k}/ex_21/inference.py    | 18 ++--
 .../sec10k}/ex_21/rename_labeled_filings.py   |  2 +-
 .../sec10k}/ex_21/train_extractor.py          | 10 +--
 .../{ => models/sec10k}/extract.py            | 72 ++++++++++++++--
 .../{ => models/sec10k}/utils/__init__.py     |  0
 .../{ => models/sec10k}/utils/cloud.py        | 14 ++-
 .../{ => models/sec10k}/utils/db_metadata.py  |  0
 .../{ => models/sec10k}/utils/layoutlm.py     |  0
 .../{ => models/sec10k}/utils/pdf.py          |  0
 tests/conftest.py                             | 25 ++++++
 .../{ => models/sec10k}/extract_test.py       |  0
 .../{ => models/sec10k}/ex21_model_test.py    |  6 +-
 .../unit/{ => models/sec10k}/extract_test.py  | 85 +++++++++----------
 tests/unit/{ => models/sec10k}/utils_test.py  | 17 ++--
 24 files changed, 201 insertions(+), 105 deletions(-)
 rename src/mozilla_sec_eia/{utils => library}/ml_tools/__init__.py (100%)
 rename src/mozilla_sec_eia/{utils => library}/ml_tools/experiment_tracking.py (92%)
 rename src/mozilla_sec_eia/{utils => library}/ml_tools/models.py (87%)
 rename src/mozilla_sec_eia/{assets.py => model_jobs.py} (89%)
 create mode 100644 src/mozilla_sec_eia/models/__init__.py
 create mode 100644 src/mozilla_sec_eia/models/sec10k/__init__.py
 rename src/mozilla_sec_eia/{ => models/sec10k}/basic_10k.py (98%)
 rename src/mozilla_sec_eia/{ => models/sec10k}/ex_21/__init__.py (100%)
 rename src/mozilla_sec_eia/{ => models/sec10k}/ex_21/create_labeled_dataset.py (98%)
 rename src/mozilla_sec_eia/{ => models/sec10k}/ex_21/inference.py (98%)
 rename src/mozilla_sec_eia/{ => models/sec10k}/ex_21/rename_labeled_filings.py (98%)
 rename src/mozilla_sec_eia/{ => models/sec10k}/ex_21/train_extractor.py (94%)
 rename src/mozilla_sec_eia/{ => models/sec10k}/extract.py (69%)
 rename src/mozilla_sec_eia/{ => models/sec10k}/utils/__init__.py (100%)
 rename src/mozilla_sec_eia/{ => models/sec10k}/utils/cloud.py (96%)
 rename src/mozilla_sec_eia/{ => models/sec10k}/utils/db_metadata.py (100%)
 rename src/mozilla_sec_eia/{ => models/sec10k}/utils/layoutlm.py (100%)
 rename src/mozilla_sec_eia/{ => models/sec10k}/utils/pdf.py (100%)
 rename tests/integration/{ => models/sec10k}/extract_test.py (100%)
 rename tests/unit/{ => models/sec10k}/ex21_model_test.py (85%)
 rename tests/unit/{ => models/sec10k}/extract_test.py (68%)
 rename tests/unit/{ => models/sec10k}/utils_test.py (89%)

diff --git a/src/mozilla_sec_eia/__init__.py b/src/mozilla_sec_eia/__init__.py
index 40ae3f8..74617af 100644
--- a/src/mozilla_sec_eia/__init__.py
+++ b/src/mozilla_sec_eia/__init__.py
@@ -1,14 +1,13 @@
 """A template repository for a Python package created by Catalyst Cooperative."""
 
 import logging
-from pathlib import Path
 
 import pkg_resources
 
 # In order for the package modules to be available when you import the package,
 # they need to be imported here somehow. Not sure if this is best practice though.
-import mozilla_sec_eia.cli
-import mozilla_sec_eia.utils  # noqa: F401
+import mozilla_sec_eia.library
+import mozilla_sec_eia.models
 
 __author__ = "Catalyst Cooperative"
 __contact__ = "pudl@catalyst.coop"
diff --git a/src/mozilla_sec_eia/utils/ml_tools/__init__.py b/src/mozilla_sec_eia/library/ml_tools/__init__.py
similarity index 100%
rename from src/mozilla_sec_eia/utils/ml_tools/__init__.py
rename to src/mozilla_sec_eia/library/ml_tools/__init__.py
diff --git a/src/mozilla_sec_eia/utils/ml_tools/experiment_tracking.py b/src/mozilla_sec_eia/library/ml_tools/experiment_tracking.py
similarity index 92%
rename from src/mozilla_sec_eia/utils/ml_tools/experiment_tracking.py
rename to src/mozilla_sec_eia/library/ml_tools/experiment_tracking.py
index 394b8b1..adb3740 100644
--- a/src/mozilla_sec_eia/utils/ml_tools/experiment_tracking.py
+++ b/src/mozilla_sec_eia/library/ml_tools/experiment_tracking.py
@@ -197,3 +197,18 @@ def teardown_experiment_tracker(_results):
         mlflow.end_run()
 
     return teardown_experiment_tracker
+
+
+def get_most_recent_run(
+    experiment_name: str, dagster_run_id: str
+) -> mlflow.entities.Run:
+    """Search mlflow for most recent extraction run with specified experiment name."""
+    run_metadata = mlflow.search_runs(
+        experiment_names=[experiment_name],
+        filter_string=f"tags.dagster_run_id!='{dagster_run_id}'",
+    )
+
+    # Mlflow returns runs ordered by their runtime, so it's easy to grab the latest run
+    # This assert will ensure this doesn't silently break if the ordering changes
+    assert run_metadata.loc[0, "end_time"] == run_metadata["end_time"].max()
+    return mlflow.get_run(run_metadata.loc[0, "run_id"])
diff --git a/src/mozilla_sec_eia/utils/ml_tools/models.py b/src/mozilla_sec_eia/library/ml_tools/models.py
similarity index 87%
rename from src/mozilla_sec_eia/utils/ml_tools/models.py
rename to src/mozilla_sec_eia/library/ml_tools/models.py
index 606bdc7..139066f 100644
--- a/src/mozilla_sec_eia/utils/ml_tools/models.py
+++ b/src/mozilla_sec_eia/library/ml_tools/models.py
@@ -29,14 +29,13 @@
     HookContext,
     JobDefinition,
     OpDefinition,
+    ResourceDefinition,
     RunConfig,
     job,
     op,
     success_hook,
 )
 
-from mozilla_sec_eia.utils import GCSArchive
-
 from .experiment_tracking import (
     ExperimentTracker,
     experiment_tracker_teardown_factory,
@@ -90,7 +89,11 @@ def get_pudl_model_job_name(experiment_name: str) -> str:
     return f"{experiment_name}_job"
 
 
-def pudl_model(experiment_name: str, config_from_yaml: bool = False) -> JobDefinition:
+def pudl_model(
+    experiment_name: str,
+    resources: dict[str, ResourceDefinition] = {},
+    config_from_yaml: bool = False,
+) -> JobDefinition:
     """Decorator for an ML model that will handle providing configuration to dagster."""
 
     def _decorator(model_graph: GraphDefinition):
@@ -99,16 +102,6 @@ def _decorator(model_graph: GraphDefinition):
             model_config |= get_yml_config(model_graph.name)
 
         # Add resources to resource dict
-        cloud_interface = GCSArchive(
-            filings_bucket_name=EnvVar("GCS_FILINGS_BUCKET_NAME"),
-            labels_bucket_name=EnvVar("GCS_LABELS_BUCKET_NAME"),
-            metadata_db_instance_connection=EnvVar(
-                "GCS_METADATA_DB_INSTANCE_CONNECTION"
-            ),
-            user=EnvVar("GCS_IAM_USER"),
-            metadata_db_name=EnvVar("GCS_METADATA_DB_NAME"),
-            project=EnvVar("GCS_PROJECT"),
-        )
         MODEL_RESOURCES.update(
             {
                 get_tracking_resource_name(experiment_name): ExperimentTracker(
@@ -116,8 +109,8 @@ def _decorator(model_graph: GraphDefinition):
                     tracking_uri=EnvVar("MLFLOW_TRACKING_URI"),
                     project=EnvVar("GCS_PROJECT"),
                 ),
-                "cloud_interface": cloud_interface,
             }
+            | resources
         )
 
         default_config = RunConfig(
@@ -156,7 +149,7 @@ def model_asset(**kwargs):
 
             _collect_results(graph_output, [teardown])
 
-        PUDL_MODELS[get_pudl_model_job_name(experiment_name)]
+        PUDL_MODELS[get_pudl_model_job_name(experiment_name)] = model_asset
         return model_asset
 
     return _decorator
diff --git a/src/mozilla_sec_eia/assets.py b/src/mozilla_sec_eia/model_jobs.py
similarity index 89%
rename from src/mozilla_sec_eia/assets.py
rename to src/mozilla_sec_eia/model_jobs.py
index 748690d..9e5a5b3 100644
--- a/src/mozilla_sec_eia/assets.py
+++ b/src/mozilla_sec_eia/model_jobs.py
@@ -5,7 +5,7 @@
 import coloredlogs
 from dagster import Definitions
 
-from mozilla_sec_eia.utils import ml_tools
+from mozilla_sec_eia.library import ml_tools
 
 logger = logging.getLogger("catalystcoop")
 log_format = "%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s"
diff --git a/src/mozilla_sec_eia/models/__init__.py b/src/mozilla_sec_eia/models/__init__.py
new file mode 100644
index 0000000..ba3ac6a
--- /dev/null
+++ b/src/mozilla_sec_eia/models/__init__.py
@@ -0,0 +1,3 @@
+"""Implement specific PUDL models in this module."""
+
+from .sec10k import extract
diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py
new file mode 100644
index 0000000..001c6ad
--- /dev/null
+++ b/src/mozilla_sec_eia/models/sec10k/__init__.py
@@ -0,0 +1 @@
+"""Implement models to extract data from SEC10k filings."""
diff --git a/src/mozilla_sec_eia/basic_10k.py b/src/mozilla_sec_eia/models/sec10k/basic_10k.py
similarity index 98%
rename from src/mozilla_sec_eia/basic_10k.py
rename to src/mozilla_sec_eia/models/sec10k/basic_10k.py
index cf04ada..e5b5f72 100644
--- a/src/mozilla_sec_eia/basic_10k.py
+++ b/src/mozilla_sec_eia/models/sec10k/basic_10k.py
@@ -5,7 +5,7 @@
 import pandas as pd
 from dagster import Out, op
 
-from mozilla_sec_eia.utils.cloud import GCSArchive, Sec10K
+from .utils.cloud import GCSArchive, Sec10K
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 EXPERIMENT_NAME = "basic_10k_extraction"
diff --git a/src/mozilla_sec_eia/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
similarity index 100%
rename from src/mozilla_sec_eia/ex_21/__init__.py
rename to src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
diff --git a/src/mozilla_sec_eia/ex_21/create_labeled_dataset.py b/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py
similarity index 98%
rename from src/mozilla_sec_eia/ex_21/create_labeled_dataset.py
rename to src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py
index 8353dca..55e1d5a 100644
--- a/src/mozilla_sec_eia/ex_21/create_labeled_dataset.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py
@@ -7,9 +7,9 @@
 
 import pandas as pd
 
-from mozilla_sec_eia.utils.cloud import GCSArchive
-from mozilla_sec_eia.utils.layoutlm import normalize_bboxes
-from mozilla_sec_eia.utils.pdf import (
+from ..utils.cloud import GCSArchive
+from ..utils.layoutlm import normalize_bboxes
+from ..utils.pdf import (
     get_pdf_data_from_path,
     pil_to_cv2,
     render_page,
diff --git a/src/mozilla_sec_eia/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
similarity index 98%
rename from src/mozilla_sec_eia/ex_21/inference.py
rename to src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
index ce1926f..2016630 100644
--- a/src/mozilla_sec_eia/ex_21/inference.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
@@ -16,21 +16,21 @@
 )
 from transformers.tokenization_utils_base import BatchEncoding
 
-from mozilla_sec_eia.ex_21.create_labeled_dataset import (
-    BBOX_COLS_PDF,
-    format_label_studio_output,
-    get_image_dict,
-)
-from mozilla_sec_eia.ex_21.train_extractor import BBOX_COLS, LABELS
-from mozilla_sec_eia.utils.cloud import get_metadata_filename
-from mozilla_sec_eia.utils.layoutlm import (
+from ..utils.cloud import get_metadata_filename
+from ..utils.layoutlm import (
     get_id_label_conversions,
     iob_to_label,
     normalize_bboxes,
 )
-from mozilla_sec_eia.utils.pdf import (
+from ..utils.pdf import (
     get_pdf_data_from_path,
 )
+from .create_labeled_dataset import (
+    BBOX_COLS_PDF,
+    format_label_studio_output,
+    get_image_dict,
+)
+from .train_extractor import BBOX_COLS, LABELS
 
 # When handling multi page documents LayoutLM uses a sliding 'frame'
 # with some overlap between frames. The overlap creates multiple
diff --git a/src/mozilla_sec_eia/ex_21/rename_labeled_filings.py b/src/mozilla_sec_eia/models/sec10k/ex_21/rename_labeled_filings.py
similarity index 98%
rename from src/mozilla_sec_eia/ex_21/rename_labeled_filings.py
rename to src/mozilla_sec_eia/models/sec10k/ex_21/rename_labeled_filings.py
index 9c7e4e5..182dd04 100644
--- a/src/mozilla_sec_eia/ex_21/rename_labeled_filings.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/rename_labeled_filings.py
@@ -5,7 +5,7 @@
 
 import pandas as pd
 
-from mozilla_sec_eia.utils.cloud import GCSArchive
+from ..utils.cloud import GCSArchive
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
diff --git a/src/mozilla_sec_eia/ex_21/train_extractor.py b/src/mozilla_sec_eia/models/sec10k/ex_21/train_extractor.py
similarity index 94%
rename from src/mozilla_sec_eia/ex_21/train_extractor.py
rename to src/mozilla_sec_eia/models/sec10k/ex_21/train_extractor.py
index e771fbb..53ed85e 100644
--- a/src/mozilla_sec_eia/ex_21/train_extractor.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/train_extractor.py
@@ -9,7 +9,7 @@
 
 import mlflow
 import numpy as np
-from dagster import Config, asset
+from dagster import Config
 from datasets import (
     Array2D,
     Array3D,
@@ -27,9 +27,8 @@
 )
 from transformers.data.data_collator import default_data_collator
 
-from mozilla_sec_eia.ex_21.create_labeled_dataset import format_as_ner_annotations
-from mozilla_sec_eia.utils.cloud import MlflowInterface
-from mozilla_sec_eia.utils.layoutlm import get_id_label_conversions, log_model
+from ..utils.layoutlm import get_id_label_conversions, log_model
+from .create_labeled_dataset import format_as_ner_annotations
 
 LABELS = [
     "O",
@@ -144,10 +143,9 @@ class FineTuneConfig(Config):
     test_size: float = 0.2
 
 
-@asset
 def train_model(
     config: FineTuneConfig,
-    layoutlm_mlflow_interface: MlflowInterface,
+    layoutlm_mlflow_interface,
 ):
     """Train LayoutLM model with labeled data."""
     # Prepare model
diff --git a/src/mozilla_sec_eia/extract.py b/src/mozilla_sec_eia/models/sec10k/extract.py
similarity index 69%
rename from src/mozilla_sec_eia/extract.py
rename to src/mozilla_sec_eia/models/sec10k/extract.py
index 90d87ba..602e57a 100644
--- a/src/mozilla_sec_eia/extract.py
+++ b/src/mozilla_sec_eia/models/sec10k/extract.py
@@ -16,17 +16,21 @@
     DynamicOutput,
     GraphDefinition,
     OpDefinition,
+    OpExecutionContext,
+    Out,
     graph,
     op,
 )
 from mlflow.entities import Run
 
-from mozilla_sec_eia import basic_10k
-from mozilla_sec_eia.utils.cloud import GCSArchive
-from mozilla_sec_eia.utils.ml_tools.experiment_tracking import (
+from mozilla_sec_eia.library.ml_tools.experiment_tracking import (
+    get_most_recent_run,
     get_tracking_resource_name,
 )
-from mozilla_sec_eia.utils.ml_tools.models import pudl_model
+from mozilla_sec_eia.library.ml_tools.models import pudl_model
+
+from . import basic_10k
+from .utils.cloud import GCSArchive, cloud_interface_resource
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
@@ -107,6 +111,12 @@ def chunk_filings(
         yield DynamicOutput(chunk, mapping_key=str(i))
 
 
+class GetMostRecentRunResultsConfig(Config):
+    """Configuration specifying whether to get run results and continue."""
+
+    continue_run: bool = False
+
+
 def extract_model_factory(
     dataset_name: str, extract_op: OpDefinition | GraphDefinition
 ):
@@ -114,14 +124,50 @@ def extract_model_factory(
     experiment_name = f"{dataset_name}_extraction"
     experiment_tracker_resource = get_tracking_resource_name(experiment_name)
 
+    @op(
+        required_resource_keys=[experiment_tracker_resource],
+        out={
+            "extraction_metadata": Out(),
+            "extracted": Out(),
+            "filings_to_extract": Out(),
+        },
+    )
+    def get_most_recent_run_results(
+        context: OpExecutionContext,
+        config: GetMostRecentRunResultsConfig,
+        filings_to_extract: pd.DataFrame,
+    ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+        extraction_metadata = pd.DataFrame(
+            {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)}
+        ).set_index("filename")
+        extracted = pd.DataFrame()
+
+        if config.continue_run:
+            most_recent_run = get_most_recent_run(experiment_name, context.run_id)
+            extraction_metadata = ExtractionMetadataSchema.validate(
+                _load_artifact_as_csv(
+                    most_recent_run, "/extraction_metadata.csv"
+                ).set_index("filename")
+            )
+            extracted = _load_artifact_as_parquet(most_recent_run, "/extracted.parquet")
+            filings_to_extract = filings_to_extract[
+                ~filings_to_extract["filename"].isin(extraction_metadata.index)
+            ]
+
+        return extraction_metadata, extracted, filings_to_extract
+
     @op(required_resource_keys=[experiment_tracker_resource])
     def log_extraction_data(
         metadata: pd.DataFrame,
         extraction_metadata: list[pd.DataFrame],
         extracted: list[pd.DataFrame],
+        previous_run_extraction_metadata: pd.DataFrame,
+        previous_run_extracted_data: pd.DataFrame,
     ):
-        extraction_metadata = pd.concat(extraction_metadata)
-        extracted = pd.concat(extracted)
+        extraction_metadata = pd.concat(
+            extraction_metadata + [previous_run_extraction_metadata]
+        )
+        extracted = pd.concat(extracted + [previous_run_extracted_data])
         # Use metadata to log generic metrics
         extraction_metadata = ExtractionMetadataSchema.validate(extraction_metadata)
         mlflow.log_metrics(
@@ -135,17 +181,25 @@ def log_extraction_data(
         _log_artifact_as_csv(extraction_metadata, "extraction_metadata.csv")
         _log_artifact_as_parquet(extracted, "extracted.parquet")
 
-    @pudl_model(experiment_name=experiment_name)
+    @pudl_model(
+        experiment_name=experiment_name,
+        resources={"cloud_interface": cloud_interface_resource},
+    )
     @graph(name=experiment_name)
     def extract_filings():
-        filings_to_extract = get_filings_to_extract()
+        metadata = get_filings_to_extract()
+        previous_extraction_metadata, previous_extracted, filings_to_extract = (
+            get_most_recent_run_results(metadata)
+        )
         filing_chunks = chunk_filings(filings_to_extract)
         extraction_metadata, extracted = filing_chunks.map(extract_op)
 
         return log_extraction_data(
-            filings_to_extract,
+            metadata,
             extraction_metadata.collect(),
             extracted.collect(),
+            previous_extraction_metadata,
+            previous_extracted,
         )
 
     return extract_filings
diff --git a/src/mozilla_sec_eia/utils/__init__.py b/src/mozilla_sec_eia/models/sec10k/utils/__init__.py
similarity index 100%
rename from src/mozilla_sec_eia/utils/__init__.py
rename to src/mozilla_sec_eia/models/sec10k/utils/__init__.py
diff --git a/src/mozilla_sec_eia/utils/cloud.py b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py
similarity index 96%
rename from src/mozilla_sec_eia/utils/cloud.py
rename to src/mozilla_sec_eia/models/sec10k/utils/cloud.py
index 076a5de..1232f45 100644
--- a/src/mozilla_sec_eia/utils/cloud.py
+++ b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py
@@ -12,7 +12,7 @@
 import fitz
 import pandas as pd
 import pg8000
-from dagster import ConfigurableResource
+from dagster import ConfigurableResource, EnvVar
 from google.cloud import storage
 from google.cloud.sql.connector import Connector
 from PIL import Image
@@ -21,7 +21,7 @@
 from sqlalchemy.orm import Session
 from xhtml2pdf import pisa
 
-from mozilla_sec_eia.utils.db_metadata import Base, Sec10kMetadata
+from .db_metadata import Base, Sec10kMetadata
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
@@ -398,3 +398,13 @@ def validate_archive(self) -> bool:
 def get_metadata_filename(local_filename: str):
     """Transform a local filename into the filename in GCSArchiver metadata."""
     return "edgar/data/" + local_filename.replace("-", "/", 1) + ".txt"
+
+
+cloud_interface_resource = GCSArchive(
+    filings_bucket_name=EnvVar("GCS_FILINGS_BUCKET_NAME"),
+    labels_bucket_name=EnvVar("GCS_LABELS_BUCKET_NAME"),
+    metadata_db_instance_connection=EnvVar("GCS_METADATA_DB_INSTANCE_CONNECTION"),
+    user=EnvVar("GCS_IAM_USER"),
+    metadata_db_name=EnvVar("GCS_METADATA_DB_NAME"),
+    project=EnvVar("GCS_PROJECT"),
+)
diff --git a/src/mozilla_sec_eia/utils/db_metadata.py b/src/mozilla_sec_eia/models/sec10k/utils/db_metadata.py
similarity index 100%
rename from src/mozilla_sec_eia/utils/db_metadata.py
rename to src/mozilla_sec_eia/models/sec10k/utils/db_metadata.py
diff --git a/src/mozilla_sec_eia/utils/layoutlm.py b/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py
similarity index 100%
rename from src/mozilla_sec_eia/utils/layoutlm.py
rename to src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py
diff --git a/src/mozilla_sec_eia/utils/pdf.py b/src/mozilla_sec_eia/models/sec10k/utils/pdf.py
similarity index 100%
rename from src/mozilla_sec_eia/utils/pdf.py
rename to src/mozilla_sec_eia/models/sec10k/utils/pdf.py
diff --git a/tests/conftest.py b/tests/conftest.py
index 6db667b..7fafe27 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -4,6 +4,7 @@
 from pathlib import Path
 
 import pytest
+from mozilla_sec_eia.library.ml_tools.experiment_tracking import ExperimentTracker
 
 logger = logging.getLogger(__name__)
 
@@ -33,3 +34,27 @@ def test_dir() -> Path:
     Mostly this is meant as an example of a fixture.
     """
     return Path(__file__).parent
+
+
+class TestTracker(ExperimentTracker):
+    """Create sub-class of `ExperimentTracker` to use in testing context.
+
+    Test class creates an in-memory sqlite db for tracking, and a temporary directory
+    for artifact storage.
+    """
+
+    def _get_tracking_password(self):
+        return "password"
+
+
+@pytest.fixture
+def test_tracker_factory(tmp_path):
+    def factory(experiment_name: str) -> TestTracker:
+        return TestTracker(
+            artifact_location=str(tmp_path),
+            tracking_uri="sqlite:///:memory:",
+            experiment_name=experiment_name,
+            project="",
+        )
+
+    return factory
diff --git a/tests/integration/extract_test.py b/tests/integration/models/sec10k/extract_test.py
similarity index 100%
rename from tests/integration/extract_test.py
rename to tests/integration/models/sec10k/extract_test.py
diff --git a/tests/unit/ex21_model_test.py b/tests/unit/models/sec10k/ex21_model_test.py
similarity index 85%
rename from tests/unit/ex21_model_test.py
rename to tests/unit/models/sec10k/ex21_model_test.py
index 30f10a5..2953ff8 100644
--- a/tests/unit/ex21_model_test.py
+++ b/tests/unit/models/sec10k/ex21_model_test.py
@@ -1,9 +1,9 @@
 """Unit tests for the LayoutLM model and table extractor."""
 
 import torch
-from mozilla_sec_eia.ex_21.inference import get_flattened_mode_predictions
-from mozilla_sec_eia.ex_21.train_extractor import LABELS
-from mozilla_sec_eia.utils.layoutlm import get_id_label_conversions
+from mozilla_sec_eia.models.sec10k.ex_21.inference import get_flattened_mode_predictions
+from mozilla_sec_eia.models.sec10k.ex_21.train_extractor import LABELS
+from mozilla_sec_eia.models.sec10k.utils.layoutlm import get_id_label_conversions
 
 
 def test_bbox_overlap_prediction_tie_break():
diff --git a/tests/unit/extract_test.py b/tests/unit/models/sec10k/extract_test.py
similarity index 68%
rename from tests/unit/extract_test.py
rename to tests/unit/models/sec10k/extract_test.py
index 4441c2b..5ee7086 100644
--- a/tests/unit/extract_test.py
+++ b/tests/unit/models/sec10k/extract_test.py
@@ -1,18 +1,20 @@
 """Test extraction tools/methods."""
 
 import logging
-import unittest
 
 import pandas as pd
 import pytest
-from dagster import build_asset_context
-from mozilla_sec_eia.extract import (
-    ExtractConfig,
-    _get_most_recent_run,
-    basic_10k_extract,
+from dagster import Out, op
+from mozilla_sec_eia.library.ml_tools.experiment_tracking import (
+    get_most_recent_run,
+    get_tracking_resource_name,
+)
+from mozilla_sec_eia.models.sec10k.extract import (
+    ChunkFilingsConfig,
     compute_validation_metrics,
+    extract_model_factory,
 )
-from mozilla_sec_eia.utils.cloud import GCSArchive, MlflowInterface
+from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
@@ -62,11 +64,11 @@ def second_run_results():
     )
 
 
-def test_extract_basic_10k(
+def test_sec10k_extract_pipeline(
     filings_metadata,
     first_run_results,
     second_run_results,
-    tmp_path,
+    test_tracker_factory,
 ):
     """Test high level extraction workflow."""
 
@@ -84,44 +86,33 @@ def setup_for_execution(self, context):
         def get_metadata(self):
             return filings_metadata
 
-    # Initialize mlflow with test settings
-    experiment_name = "basic_10k_extract_unit_test"
-
-    with unittest.mock.patch(
-        "mozilla_sec_eia.utils.cloud._access_secret_version", new=lambda *args: ""
-    ):
-        for i, results in enumerate([first_run_results, second_run_results]):
-            logger.info(f"Run {i} of basic 10k extraction.")
-            with (
-                build_asset_context(
-                    resources={
-                        "basic_10k_extract_config": ExtractConfig(
-                            num_filings=3 if i == 0 else -1
-                        ),
-                        "basic_10k_extract_mlflow": MlflowInterface(
-                            experiment_name=experiment_name,
-                            continue_run=i > 0,
-                            tracking_uri="sqlite:///:memory:",
-                            cloud_interface=FakeArchive(),
-                            artifact_location=str(tmp_path),
-                        ),
-                        "cloud_interface": FakeArchive(),
-                    }
-                ) as context,
-                unittest.mock.patch(
-                    "mozilla_sec_eia.extract.basic_10k.extract",
-                    new=lambda *args: results,
-                ),
-            ):
-                metadata = results[0]
-
-                # Run extract method
-                basic_10k_extract(context)
-                run = _get_most_recent_run(experiment_name)
-                assert run.data.metrics["num_failed"] == (~metadata["success"]).sum()
-                assert run.data.metrics["ratio_extracted"] == len(metadata) / len(
-                    filings_metadata
-                )
+    dataset_name = "test_pipeline"
+    experiment_name = f"{dataset_name}_extraction"
+    test_tracker = test_tracker_factory(experiment_name)
+
+    for i, results in enumerate([first_run_results, second_run_results]):
+
+        @op(out={"extraction_metadata": Out(), "extracted": Out()})
+        def _fake_extract(_filings_to_extract):
+            return results[0], results[1]
+
+        test_job = extract_model_factory(dataset_name, _fake_extract)
+        resources = {
+            "basic_10k_extract_config": ChunkFilingsConfig(
+                num_filings=3 if i == 0 else -1
+            ),
+            get_tracking_resource_name(experiment_name): test_tracker,
+            "cloud_interface": FakeArchive(),
+        }
+        metadata = results[0]
+
+        # Run extract method
+        test_job.execute_in_process(resources=resources)
+        run = get_most_recent_run(experiment_name, dagster_run_id="")
+        assert run.data.metrics["num_failed"] == (~metadata["success"]).sum()
+        assert run.data.metrics["ratio_extracted"] == len(metadata) / len(
+            filings_metadata
+        )
 
 
 @pytest.mark.parametrize(
diff --git a/tests/unit/utils_test.py b/tests/unit/models/sec10k/utils_test.py
similarity index 89%
rename from tests/unit/utils_test.py
rename to tests/unit/models/sec10k/utils_test.py
index b935c8f..de9fbd7 100644
--- a/tests/unit/utils_test.py
+++ b/tests/unit/models/sec10k/utils_test.py
@@ -6,7 +6,7 @@
 
 import pandas as pd
 import pytest
-from mozilla_sec_eia.utils.cloud import (
+from mozilla_sec_eia.models.sec10k.utils.cloud import (
     Exhibit21,
     GCSArchive,
     Sec10K,
@@ -17,8 +17,12 @@
 def test_archive():
     """Return test GCSArchive class."""
     with (
-        unittest.mock.patch("mozilla_sec_eia.utils.cloud.GCSArchive._get_engine"),
-        unittest.mock.patch("mozilla_sec_eia.utils.cloud.GCSArchive._get_bucket"),
+        unittest.mock.patch(
+            "mozilla_sec_eia.models.sec10k.utils.cloud.GCSArchive._get_engine"
+        ),
+        unittest.mock.patch(
+            "mozilla_sec_eia.models.sec10k.utils.cloud.GCSArchive._get_bucket"
+        ),
     ):
         archive = GCSArchive(
             filings_bucket_name="filings_bucket_name",
@@ -91,7 +95,8 @@ def test_validate_archive(test_archive, archive_files, metadata_files, valid, mo
         return_value=pd.DataFrame({"filename": metadata_files})
     )
     mocker.patch(
-        "mozilla_sec_eia.utils.cloud.GCSArchive.get_metadata", new=metadata_mock
+        "mozilla_sec_eia.models.sec10k.utils.cloud.GCSArchive.get_metadata",
+        new=metadata_mock,
     )
 
     assert test_archive.validate_archive() == valid
@@ -174,7 +179,9 @@ def test_validate_archive(test_archive, archive_files, metadata_files, valid, mo
 )
 def test_10k(filing_text, ex_21_version, actually_has_ex_21):
     """Test that SEC10k's are properly parsed."""
-    with unittest.mock.patch("mozilla_sec_eia.utils.cloud.logger") as mock_logger:
+    with unittest.mock.patch(
+        "mozilla_sec_eia.models.sec10k.utils.cloud.logger"
+    ) as mock_logger:
         filing = Sec10K.from_file(
             file=io.StringIO(filing_text),
             filename="sec10k.html",

From 014bcb1048d8940a18c9f1df2e2af971790c3de0 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 28 Aug 2024 10:06:03 -0400
Subject: [PATCH 005/161] Change library module structure

---
 src/mozilla_sec_eia/library/{ml_tools => }/__init__.py      | 0
 .../library/{ml_tools => }/experiment_tracking.py           | 0
 src/mozilla_sec_eia/library/{ml_tools => }/models.py        | 0
 src/mozilla_sec_eia/model_jobs.py                           | 6 +++---
 src/mozilla_sec_eia/models/sec10k/extract.py                | 4 ++--
 tests/conftest.py                                           | 2 +-
 tests/unit/models/sec10k/extract_test.py                    | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)
 rename src/mozilla_sec_eia/library/{ml_tools => }/__init__.py (100%)
 rename src/mozilla_sec_eia/library/{ml_tools => }/experiment_tracking.py (100%)
 rename src/mozilla_sec_eia/library/{ml_tools => }/models.py (100%)

diff --git a/src/mozilla_sec_eia/library/ml_tools/__init__.py b/src/mozilla_sec_eia/library/__init__.py
similarity index 100%
rename from src/mozilla_sec_eia/library/ml_tools/__init__.py
rename to src/mozilla_sec_eia/library/__init__.py
diff --git a/src/mozilla_sec_eia/library/ml_tools/experiment_tracking.py b/src/mozilla_sec_eia/library/experiment_tracking.py
similarity index 100%
rename from src/mozilla_sec_eia/library/ml_tools/experiment_tracking.py
rename to src/mozilla_sec_eia/library/experiment_tracking.py
diff --git a/src/mozilla_sec_eia/library/ml_tools/models.py b/src/mozilla_sec_eia/library/models.py
similarity index 100%
rename from src/mozilla_sec_eia/library/ml_tools/models.py
rename to src/mozilla_sec_eia/library/models.py
diff --git a/src/mozilla_sec_eia/model_jobs.py b/src/mozilla_sec_eia/model_jobs.py
index 9e5a5b3..e6438ab 100644
--- a/src/mozilla_sec_eia/model_jobs.py
+++ b/src/mozilla_sec_eia/model_jobs.py
@@ -5,13 +5,13 @@
 import coloredlogs
 from dagster import Definitions
 
-from mozilla_sec_eia.library import ml_tools
+from mozilla_sec_eia.library import get_ml_model_jobs, get_ml_model_resources
 
 logger = logging.getLogger("catalystcoop")
 log_format = "%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s"
 coloredlogs.install(fmt=log_format, logger=logger)
 
 defs = Definitions(
-    jobs=ml_tools.get_ml_model_jobs(),
-    resources=ml_tools.get_ml_model_resources(),
+    jobs=get_ml_model_jobs(),
+    resources=get_ml_model_resources(),
 )
diff --git a/src/mozilla_sec_eia/models/sec10k/extract.py b/src/mozilla_sec_eia/models/sec10k/extract.py
index 602e57a..a6fcf1e 100644
--- a/src/mozilla_sec_eia/models/sec10k/extract.py
+++ b/src/mozilla_sec_eia/models/sec10k/extract.py
@@ -23,11 +23,11 @@
 )
 from mlflow.entities import Run
 
-from mozilla_sec_eia.library.ml_tools.experiment_tracking import (
+from mozilla_sec_eia.library.experiment_tracking import (
     get_most_recent_run,
     get_tracking_resource_name,
 )
-from mozilla_sec_eia.library.ml_tools.models import pudl_model
+from mozilla_sec_eia.library.models import pudl_model
 
 from . import basic_10k
 from .utils.cloud import GCSArchive, cloud_interface_resource
diff --git a/tests/conftest.py b/tests/conftest.py
index 7fafe27..c10e825 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -4,7 +4,7 @@
 from pathlib import Path
 
 import pytest
-from mozilla_sec_eia.library.ml_tools.experiment_tracking import ExperimentTracker
+from mozilla_sec_eia.library.experiment_tracking import ExperimentTracker
 
 logger = logging.getLogger(__name__)
 
diff --git a/tests/unit/models/sec10k/extract_test.py b/tests/unit/models/sec10k/extract_test.py
index 5ee7086..c5db638 100644
--- a/tests/unit/models/sec10k/extract_test.py
+++ b/tests/unit/models/sec10k/extract_test.py
@@ -5,7 +5,7 @@
 import pandas as pd
 import pytest
 from dagster import Out, op
-from mozilla_sec_eia.library.ml_tools.experiment_tracking import (
+from mozilla_sec_eia.library.experiment_tracking import (
     get_most_recent_run,
     get_tracking_resource_name,
 )

From 54041487d81f0ad1e080854b307ce237b3c4b3ab Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 28 Aug 2024 10:17:15 -0400
Subject: [PATCH 006/161] Turn experiment_tracking into sub-package

---
 src/mozilla_sec_eia/cli.py                                | 6 ------
 .../library/experiment_tracking/__init__.py               | 8 ++++++++
 .../mlflow_resource.py}                                   | 0
 3 files changed, 8 insertions(+), 6 deletions(-)
 delete mode 100755 src/mozilla_sec_eia/cli.py
 create mode 100644 src/mozilla_sec_eia/library/experiment_tracking/__init__.py
 rename src/mozilla_sec_eia/library/{experiment_tracking.py => experiment_tracking/mlflow_resource.py} (100%)

diff --git a/src/mozilla_sec_eia/cli.py b/src/mozilla_sec_eia/cli.py
deleted file mode 100755
index 2d4a84a..0000000
--- a/src/mozilla_sec_eia/cli.py
+++ /dev/null
@@ -1,6 +0,0 @@
-"""Implements CLI for SEC to EIA linkage development.
-
-CLI is structured with nested sub-commands to make it easy
-to add new scripts which can be accessed through one top-level
-interface.
-"""
diff --git a/src/mozilla_sec_eia/library/experiment_tracking/__init__.py b/src/mozilla_sec_eia/library/experiment_tracking/__init__.py
new file mode 100644
index 0000000..7f9e7f9
--- /dev/null
+++ b/src/mozilla_sec_eia/library/experiment_tracking/__init__.py
@@ -0,0 +1,8 @@
+"""Implement tooling to interface with mlflow experiment tracking."""
+
+from .mlflow_resource import (
+    ExperimentTracker,
+    experiment_tracker_teardown_factory,
+    get_most_recent_run,
+    get_tracking_resource_name,
+)
diff --git a/src/mozilla_sec_eia/library/experiment_tracking.py b/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py
similarity index 100%
rename from src/mozilla_sec_eia/library/experiment_tracking.py
rename to src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py

From 886614fdadc24a33415fd493cee2600a2c8424be Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 28 Aug 2024 10:28:54 -0400
Subject: [PATCH 007/161] Remove unused function

---
 .../experiment_tracking/mlflow_resource.py    | 40 -------------------
 1 file changed, 40 deletions(-)

diff --git a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py b/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py
index adb3740..b033d4c 100644
--- a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py
+++ b/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py
@@ -21,46 +21,6 @@
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
 
-def _flatten_model_config(model_config: dict) -> dict:
-    """Take nested dictionary defining model config and flatten for logging purposes.
-
-    This is essentially a translation layer between Dagster configuration and mlflow,
-    which does not support displaying nested parameters in the UI.
-
-    Examples:
-        >>> _flatten_model_config(
-        ...     {
-        ...         'ferc_to_ferc': {
-        ...             'link_ids_cross_year': {
-        ...                 'compute_distance_matrix': {
-        ...                     'distance_threshold': .5,
-        ...                      'metric': 'euclidean',
-        ...                 },
-        ...                 'match_orphaned_records': {'distance_threshold': 0.5},
-        ...             }
-        ...         }
-        ...     }
-        ... ) == {
-        ...     'ferc_to_ferc.link_ids_cross_year.compute_distance_matrix.distance_threshold': 0.5,
-        ...     'ferc_to_ferc.link_ids_cross_year.compute_distance_matrix.metric': 'euclidean',
-        ...     'ferc_to_ferc.link_ids_cross_year.match_orphaned_records.distance_threshold': 0.5
-        ... }
-        True
-    """
-
-    def _flatten_level(config_level: dict, param_name: str):
-        flattened_dict = {}
-        for key, val in config_level.items():
-            flattened_param = f"{param_name}.{key}"
-            if isinstance(val, dict):
-                flattened_dict |= _flatten_level(val, param_name=flattened_param)
-            else:
-                flattened_dict[flattened_param[1:]] = val
-        return flattened_dict
-
-    return _flatten_level(model_config, "")
-
-
 class ExperimentTracker(ConfigurableResource):
     """Class to manage tracking a machine learning model using MLflow.
 

From dec80b81cce5113834f2826a31dd906757ac5a04 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 28 Aug 2024 10:40:06 -0400
Subject: [PATCH 008/161] Gracefully handle mlflow run on failure

---
 src/mozilla_sec_eia/library/models.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/mozilla_sec_eia/library/models.py b/src/mozilla_sec_eia/library/models.py
index 139066f..1094f9f 100644
--- a/src/mozilla_sec_eia/library/models.py
+++ b/src/mozilla_sec_eia/library/models.py
@@ -31,10 +31,12 @@
     OpDefinition,
     ResourceDefinition,
     RunConfig,
+    failure_hook,
     job,
     op,
     success_hook,
 )
+from mlflow.entities.run_status import RunStatus
 
 from .experiment_tracking import (
     ExperimentTracker,
@@ -133,10 +135,21 @@ def _log_config_hook(context: HookContext):
                     }
                 )
 
+        @failure_hook(
+            required_resource_keys={get_tracking_resource_name(experiment_name)}
+        )
+        def _end_mlflow_run_with_failure(context: HookContext):
+            exception = context.op_exception
+
+            if isinstance(exception, KeyboardInterrupt):
+                mlflow.end_run(status=RunStatus.to_string(RunStatus.KILLED))
+            else:
+                mlflow.end_run(status=RunStatus.to_string(RunStatus.FAILED))
+
         @job(
             name=get_pudl_model_job_name(experiment_name),
             config=default_config,
-            hooks={_log_config_hook},
+            hooks={_log_config_hook, _end_mlflow_run_with_failure},
         )
         def model_asset(**kwargs):
             tracker_teardown = experiment_tracker_teardown_factory(

From e725f3dedbbd3ee82dc4750cbc0c382525aa874b Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 28 Aug 2024 11:01:27 -0400
Subject: [PATCH 009/161] Fix variable name

---
 src/mozilla_sec_eia/library/models.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/mozilla_sec_eia/library/models.py b/src/mozilla_sec_eia/library/models.py
index 1094f9f..1d8b38d 100644
--- a/src/mozilla_sec_eia/library/models.py
+++ b/src/mozilla_sec_eia/library/models.py
@@ -151,7 +151,7 @@ def _end_mlflow_run_with_failure(context: HookContext):
             config=default_config,
             hooks={_log_config_hook, _end_mlflow_run_with_failure},
         )
-        def model_asset(**kwargs):
+        def model_job(**kwargs):
             tracker_teardown = experiment_tracker_teardown_factory(
                 experiment_name=model_graph.name,
             )
@@ -162,7 +162,7 @@ def model_asset(**kwargs):
 
             _collect_results(graph_output, [teardown])
 
-        PUDL_MODELS[get_pudl_model_job_name(experiment_name)] = model_asset
-        return model_asset
+        PUDL_MODELS[get_pudl_model_job_name(experiment_name)] = model_job
+        return model_job
 
     return _decorator

From df44ed5eee5a7d8e75f2e177a037088a597c08ad Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 28 Aug 2024 12:35:36 -0400
Subject: [PATCH 010/161] Change experiment tracker resource names

---
 .../library/experiment_tracking/__init__.py   |   1 -
 .../experiment_tracking/mlflow_resource.py    |   7 +-
 src/mozilla_sec_eia/library/models.py         |  27 ++--
 src/mozilla_sec_eia/model_jobs.py             |   3 +-
 src/mozilla_sec_eia/models/sec10k/extract.py  | 130 ++++++++++--------
 tests/unit/models/sec10k/extract_test.py      |   9 +-
 6 files changed, 86 insertions(+), 91 deletions(-)

diff --git a/src/mozilla_sec_eia/library/experiment_tracking/__init__.py b/src/mozilla_sec_eia/library/experiment_tracking/__init__.py
index 7f9e7f9..65cbf7a 100644
--- a/src/mozilla_sec_eia/library/experiment_tracking/__init__.py
+++ b/src/mozilla_sec_eia/library/experiment_tracking/__init__.py
@@ -4,5 +4,4 @@
     ExperimentTracker,
     experiment_tracker_teardown_factory,
     get_most_recent_run,
-    get_tracking_resource_name,
 )
diff --git a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py b/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py
index b033d4c..f2c113f 100644
--- a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py
+++ b/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py
@@ -138,11 +138,6 @@ def get_or_create_experiment(
         return experiment_id
 
 
-def get_tracking_resource_name(experiment_name: str):
-    """Return expected name of experiment tracking resource given experiment name."""
-    return f"{experiment_name}_tracker"
-
-
 def experiment_tracker_teardown_factory(
     experiment_name: str,
 ) -> ExperimentTracker:
@@ -151,7 +146,7 @@ def experiment_tracker_teardown_factory(
 
     @op(
         name=f"{experiment_name}_tracker_teardown",
-        required_resource_keys=[f"{experiment_name}_tracker"],
+        required_resource_keys=["experiment_tracker"],
     )
     def teardown_experiment_tracker(_results):
         mlflow.end_run()
diff --git a/src/mozilla_sec_eia/library/models.py b/src/mozilla_sec_eia/library/models.py
index 1d8b38d..135df39 100644
--- a/src/mozilla_sec_eia/library/models.py
+++ b/src/mozilla_sec_eia/library/models.py
@@ -41,7 +41,6 @@
 from .experiment_tracking import (
     ExperimentTracker,
     experiment_tracker_teardown_factory,
-    get_tracking_resource_name,
 )
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
@@ -104,16 +103,13 @@ def _decorator(model_graph: GraphDefinition):
             model_config |= get_yml_config(model_graph.name)
 
         # Add resources to resource dict
-        MODEL_RESOURCES.update(
-            {
-                get_tracking_resource_name(experiment_name): ExperimentTracker(
-                    experiment_name=experiment_name,
-                    tracking_uri=EnvVar("MLFLOW_TRACKING_URI"),
-                    project=EnvVar("GCS_PROJECT"),
-                ),
-            }
-            | resources
-        )
+        model_resources = {
+            "experiment_tracker": ExperimentTracker(
+                experiment_name=experiment_name,
+                tracking_uri=EnvVar("MLFLOW_TRACKING_URI"),
+                project=EnvVar("GCS_PROJECT"),
+            ),
+        } | resources
 
         default_config = RunConfig(
             ops=model_config,
@@ -123,9 +119,7 @@ def _decorator(model_graph: GraphDefinition):
         def _collect_results(model_graph_output, _implicit_dependencies: list):
             return model_graph_output
 
-        @success_hook(
-            required_resource_keys={get_tracking_resource_name(experiment_name)}
-        )
+        @success_hook(required_resource_keys={"experiment_tracker"})
         def _log_config_hook(context: HookContext):
             if (config := context.op_config) is not None:
                 mlflow.log_params(
@@ -135,9 +129,7 @@ def _log_config_hook(context: HookContext):
                     }
                 )
 
-        @failure_hook(
-            required_resource_keys={get_tracking_resource_name(experiment_name)}
-        )
+        @failure_hook(required_resource_keys={"experiment_tracker"})
         def _end_mlflow_run_with_failure(context: HookContext):
             exception = context.op_exception
 
@@ -150,6 +142,7 @@ def _end_mlflow_run_with_failure(context: HookContext):
             name=get_pudl_model_job_name(experiment_name),
             config=default_config,
             hooks={_log_config_hook, _end_mlflow_run_with_failure},
+            resource_defs=model_resources,
         )
         def model_job(**kwargs):
             tracker_teardown = experiment_tracker_teardown_factory(
diff --git a/src/mozilla_sec_eia/model_jobs.py b/src/mozilla_sec_eia/model_jobs.py
index e6438ab..2a8918b 100644
--- a/src/mozilla_sec_eia/model_jobs.py
+++ b/src/mozilla_sec_eia/model_jobs.py
@@ -5,7 +5,7 @@
 import coloredlogs
 from dagster import Definitions
 
-from mozilla_sec_eia.library import get_ml_model_jobs, get_ml_model_resources
+from mozilla_sec_eia.library import get_ml_model_jobs
 
 logger = logging.getLogger("catalystcoop")
 log_format = "%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s"
@@ -13,5 +13,4 @@
 
 defs = Definitions(
     jobs=get_ml_model_jobs(),
-    resources=get_ml_model_resources(),
 )
diff --git a/src/mozilla_sec_eia/models/sec10k/extract.py b/src/mozilla_sec_eia/models/sec10k/extract.py
index a6fcf1e..795f988 100644
--- a/src/mozilla_sec_eia/models/sec10k/extract.py
+++ b/src/mozilla_sec_eia/models/sec10k/extract.py
@@ -18,14 +18,15 @@
     OpDefinition,
     OpExecutionContext,
     Out,
+    ResourceDefinition,
     graph,
     op,
 )
 from mlflow.entities import Run
 
 from mozilla_sec_eia.library.experiment_tracking import (
+    ExperimentTracker,
     get_most_recent_run,
-    get_tracking_resource_name,
 )
 from mozilla_sec_eia.library.models import pudl_model
 
@@ -117,73 +118,80 @@ class GetMostRecentRunResultsConfig(Config):
     continue_run: bool = False
 
 
+@op(
+    out={
+        "extraction_metadata": Out(),
+        "extracted": Out(),
+        "filings_to_extract": Out(),
+    },
+)
+def get_most_recent_run_results(
+    context: OpExecutionContext,
+    config: GetMostRecentRunResultsConfig,
+    experiment_tracker: ExperimentTracker,
+    filings_to_extract: pd.DataFrame,
+) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+    """Get results from previous run to continue extraction."""
+    extraction_metadata = pd.DataFrame(
+        {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)}
+    ).set_index("filename")
+    extracted = pd.DataFrame()
+
+    if config.continue_run:
+        most_recent_run = get_most_recent_run(
+            experiment_tracker.experiment_name, context.run_id
+        )
+        extraction_metadata = ExtractionMetadataSchema.validate(
+            _load_artifact_as_csv(
+                most_recent_run, "/extraction_metadata.csv"
+            ).set_index("filename")
+        )
+        extracted = _load_artifact_as_parquet(most_recent_run, "/extracted.parquet")
+        filings_to_extract = filings_to_extract[
+            ~filings_to_extract["filename"].isin(extraction_metadata.index)
+        ]
+
+    return extraction_metadata, extracted, filings_to_extract
+
+
+@op(required_resource_keys=["experiment_tracker"])
+def log_extraction_data(
+    metadata: pd.DataFrame,
+    extraction_metadata: list[pd.DataFrame],
+    extracted: list[pd.DataFrame],
+    previous_run_extraction_metadata: pd.DataFrame,
+    previous_run_extracted_data: pd.DataFrame,
+):
+    """Log results from extraction run."""
+    extraction_metadata = pd.concat(
+        extraction_metadata + [previous_run_extraction_metadata]
+    )
+    extracted = pd.concat(extracted + [previous_run_extracted_data])
+    # Use metadata to log generic metrics
+    extraction_metadata = ExtractionMetadataSchema.validate(extraction_metadata)
+    mlflow.log_metrics(
+        {
+            "num_failed": (~extraction_metadata["success"]).sum(),
+            "ratio_extracted": len(extraction_metadata) / len(metadata),
+        }
+    )
+
+    # Log the extraction results + metadata for future reference/analysis
+    _log_artifact_as_csv(extraction_metadata, "extraction_metadata.csv")
+    _log_artifact_as_parquet(extracted, "extracted.parquet")
+
+
 def extract_model_factory(
-    dataset_name: str, extract_op: OpDefinition | GraphDefinition
+    dataset_name: str,
+    extract_op: OpDefinition | GraphDefinition,
+    resources: dict[str, ResourceDefinition] = {},
 ):
     """Produce a `pudl_model` to extract data from sec10k filings."""
     experiment_name = f"{dataset_name}_extraction"
-    experiment_tracker_resource = get_tracking_resource_name(experiment_name)
-
-    @op(
-        required_resource_keys=[experiment_tracker_resource],
-        out={
-            "extraction_metadata": Out(),
-            "extracted": Out(),
-            "filings_to_extract": Out(),
-        },
-    )
-    def get_most_recent_run_results(
-        context: OpExecutionContext,
-        config: GetMostRecentRunResultsConfig,
-        filings_to_extract: pd.DataFrame,
-    ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
-        extraction_metadata = pd.DataFrame(
-            {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)}
-        ).set_index("filename")
-        extracted = pd.DataFrame()
-
-        if config.continue_run:
-            most_recent_run = get_most_recent_run(experiment_name, context.run_id)
-            extraction_metadata = ExtractionMetadataSchema.validate(
-                _load_artifact_as_csv(
-                    most_recent_run, "/extraction_metadata.csv"
-                ).set_index("filename")
-            )
-            extracted = _load_artifact_as_parquet(most_recent_run, "/extracted.parquet")
-            filings_to_extract = filings_to_extract[
-                ~filings_to_extract["filename"].isin(extraction_metadata.index)
-            ]
-
-        return extraction_metadata, extracted, filings_to_extract
-
-    @op(required_resource_keys=[experiment_tracker_resource])
-    def log_extraction_data(
-        metadata: pd.DataFrame,
-        extraction_metadata: list[pd.DataFrame],
-        extracted: list[pd.DataFrame],
-        previous_run_extraction_metadata: pd.DataFrame,
-        previous_run_extracted_data: pd.DataFrame,
-    ):
-        extraction_metadata = pd.concat(
-            extraction_metadata + [previous_run_extraction_metadata]
-        )
-        extracted = pd.concat(extracted + [previous_run_extracted_data])
-        # Use metadata to log generic metrics
-        extraction_metadata = ExtractionMetadataSchema.validate(extraction_metadata)
-        mlflow.log_metrics(
-            {
-                "num_failed": (~extraction_metadata["success"]).sum(),
-                "ratio_extracted": len(extraction_metadata) / len(metadata),
-            }
-        )
-
-        # Log the extraction results + metadata for future reference/analysis
-        _log_artifact_as_csv(extraction_metadata, "extraction_metadata.csv")
-        _log_artifact_as_parquet(extracted, "extracted.parquet")
 
     @pudl_model(
         experiment_name=experiment_name,
-        resources={"cloud_interface": cloud_interface_resource},
+        resources={"cloud_interface": cloud_interface_resource} | resources,
     )
     @graph(name=experiment_name)
     def extract_filings():
diff --git a/tests/unit/models/sec10k/extract_test.py b/tests/unit/models/sec10k/extract_test.py
index c5db638..925b5c1 100644
--- a/tests/unit/models/sec10k/extract_test.py
+++ b/tests/unit/models/sec10k/extract_test.py
@@ -7,7 +7,6 @@
 from dagster import Out, op
 from mozilla_sec_eia.library.experiment_tracking import (
     get_most_recent_run,
-    get_tracking_resource_name,
 )
 from mozilla_sec_eia.models.sec10k.extract import (
     ChunkFilingsConfig,
@@ -96,18 +95,20 @@ def get_metadata(self):
         def _fake_extract(_filings_to_extract):
             return results[0], results[1]
 
-        test_job = extract_model_factory(dataset_name, _fake_extract)
         resources = {
             "basic_10k_extract_config": ChunkFilingsConfig(
                 num_filings=3 if i == 0 else -1
             ),
-            get_tracking_resource_name(experiment_name): test_tracker,
+            "experiment_tracker": test_tracker,
             "cloud_interface": FakeArchive(),
         }
+        test_job = extract_model_factory(
+            dataset_name, _fake_extract, resources=resources
+        )
         metadata = results[0]
 
         # Run extract method
-        test_job.execute_in_process(resources=resources)
+        test_job.execute_in_process()
         run = get_most_recent_run(experiment_name, dagster_run_id="")
         assert run.data.metrics["num_failed"] == (~metadata["success"]).sum()
         assert run.data.metrics["ratio_extracted"] == len(metadata) / len(

From 93da0522ce593f3ff30a94775421f90082ff5ff8 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 28 Aug 2024 17:10:41 -0400
Subject: [PATCH 011/161] Add mlflow artifact io-manager

---
 .../library/experiment_tracking/__init__.py   |   1 +
 .../experiment_tracking/mlflow_io_managers.py | 100 ++++++++
 .../experiment_tracking/mlflow_resource.py    |  41 ++--
 src/mozilla_sec_eia/library/models.py         |  24 +-
 src/mozilla_sec_eia/models/sec10k/extract.py  | 217 ++++++++++--------
 tests/conftest.py                             |  17 ++
 tests/unit/models/sec10k/extract_test.py      | 124 +++++++---
 7 files changed, 377 insertions(+), 147 deletions(-)
 create mode 100644 src/mozilla_sec_eia/library/experiment_tracking/mlflow_io_managers.py

diff --git a/src/mozilla_sec_eia/library/experiment_tracking/__init__.py b/src/mozilla_sec_eia/library/experiment_tracking/__init__.py
index 65cbf7a..5a468d7 100644
--- a/src/mozilla_sec_eia/library/experiment_tracking/__init__.py
+++ b/src/mozilla_sec_eia/library/experiment_tracking/__init__.py
@@ -1,5 +1,6 @@
 """Implement tooling to interface with mlflow experiment tracking."""
 
+from .mlflow_io_managers import MlflowPandasArtifactIOManager
 from .mlflow_resource import (
     ExperimentTracker,
     experiment_tracker_teardown_factory,
diff --git a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_io_managers.py b/src/mozilla_sec_eia/library/experiment_tracking/mlflow_io_managers.py
new file mode 100644
index 0000000..a1a6850
--- /dev/null
+++ b/src/mozilla_sec_eia/library/experiment_tracking/mlflow_io_managers.py
@@ -0,0 +1,100 @@
+"""Implement IO managers for loading models/artifacts from tracking server."""
+
+import io
+import logging
+import tempfile
+from pathlib import Path
+from typing import Literal
+
+import mlflow
+import pandas as pd
+from dagster import ConfigurableIOManager, InputContext, OutputContext
+from mlflow.entities import Run
+
+from .mlflow_resource import ExperimentTracker
+
+logger = logging.getLogger(f"catalystcoop.{__name__}")
+
+
+class MlflowPandasArtifactIOManager(ConfigurableIOManager):
+    """Implement IO manager for logging/loading DataFrames as parquet or CSV mlflow artifacts."""
+
+    experiment_tracker: ExperimentTracker
+    #: By default handles artifacts from current run, but can be used with previous run.
+    use_previous_mlflow_run: bool = False
+    file_type: Literal["parquet", "csv"] = "parquet"
+
+    def _load_artifact_as_csv(self, run: Run, artifact_name: str) -> pd.DataFrame:
+        """Download a CSV and parse to DataFrame from mlflow tracking server."""
+        df = pd.read_csv(
+            io.StringIO(
+                mlflow.artifacts.load_text(run.info.artifact_uri + f"/{artifact_name}")
+            )
+        )
+        return df
+
+    def _log_artifact_as_csv(
+        self, artifact: pd.DataFrame, artifact_name: str, index: bool = True
+    ):
+        """Upload a DataFrame as a CSV to mlflow tracking server."""
+        return mlflow.log_text(artifact.to_csv(index=index), artifact_name)
+
+    def _load_artifact_as_parquet(self, run: Run, artifact_name: str) -> pd.DataFrame:
+        """Download a parquet file and parse to DataFrame from mlflow tracking server."""
+        df = pd.read_parquet(run.info.artifact_uri + f"/{artifact_name}")
+        return df
+
+    def _log_artifact_as_parquet(
+        self, artifact: pd.DataFrame, artifact_name: str, index: bool = True
+    ):
+        """Upload a DataFrame as a parquet file to mlflow tracking server."""
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            parquet_path = Path(tmp_dir) / artifact_name
+            artifact.to_parquet(parquet_path, index=index)
+            return mlflow.log_artifact(parquet_path, artifact_name)
+
+    def _get_dagster_run_id(self, context: InputContext | OutputContext) -> str:
+        """Return dagster run id of current dagster run."""
+        return context.get_identifier()[0]
+
+    def handle_output(self, context: OutputContext, df: pd.DataFrame):
+        """Attach dataframe to run as artifact."""
+        if self.use_previous_mlflow_run:
+            raise NotImplementedError(
+                "MlflowPandasArtifactIOManager can not be used to add artifacts to completed run."
+            )
+
+        if self.file_type == "csv":
+            self._log_artifact_as_csv(df, artifact_name=f"{context.name}.csv")
+        else:
+            self._log_artifact_as_parquet(df, artifact_name=f"{context.name}.parquet")
+
+    def _get_run_info(self) -> Run:
+        """Use `dagster_run_id` and `use_previous_mlflow_run` to get run info from appropriate mlflow run."""
+        dagster_run_id = self.experiment_tracker.get_run_id()
+        filter_string = f"tags.dagster_run_id='{dagster_run_id}'"
+        if self.use_previous_mlflow_run:
+            filter_string = f"tags.dagster_run_id!='{dagster_run_id}'"
+
+        run_metadata = mlflow.search_runs(
+            experiment_names=[self.experiment_tracker.experiment_name],
+            filter_string=filter_string,
+        )
+
+        # Mlflow returns runs ordered by start time (newest first) by default, so row 0 is the latest run
+        return mlflow.get_run(run_metadata.loc[0, "run_id"])
+
+    def load_input(self, context: InputContext) -> pd.DataFrame:
+        """Handle loading dataframes from mlflow run artifacts."""
+        mlflow_run = self._get_run_info()
+
+        if self.file_type == "csv":
+            df = self._load_artifact_as_csv(
+                mlflow_run, artifact_name=f"{context.name}.csv"
+            )
+        else:
+            df = self._load_artifact_as_parquet(
+                mlflow_run, artifact_name=f"{context.name}.parquet"
+            )
+
+        return df
diff --git a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py b/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py
index f2c113f..744c013 100644
--- a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py
+++ b/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py
@@ -15,8 +15,9 @@
 from contextlib import contextmanager
 
 import mlflow
-from dagster import ConfigurableResource, InitResourceContext, op
+from dagster import ConfigurableResource, In, InitResourceContext, Nothing, op
 from google.cloud import secretmanager
+from pydantic import PrivateAttr
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
@@ -50,15 +51,24 @@ class ExperimentTracker(ConfigurableResource):
     tags: dict = {}
     project: str
 
+    _run_id: str = PrivateAttr()
+
     @contextmanager
     def yield_for_execution(
         self,
         context: InitResourceContext,
     ) -> "ExperimentTracker":
         """Create experiment tracker for specified experiment."""
+        self._run_id = context.run_id
+
         if self.tracking_enabled:
             self._configure_mlflow()
 
+            # Hack to stop mlflow from ending run at process barrier
+            # This is borrowed from the official dagster mlflow resource found here:
+            # https://github.com/dagster-io/dagster/blob/master/python_modules/libraries/dagster-mlflow/dagster_mlflow/resources.py
+            atexit.unregister(mlflow.end_run)
+
             # Get run_id associated with current dagster run
             experiment_id = self.get_or_create_experiment(
                 experiment_name=self.experiment_name,
@@ -66,18 +76,18 @@ def yield_for_execution(
             )
             mlflow_run_id = self._get_mlflow_run_id(context.run_id, experiment_id)
 
-            # Hack to stop mlflow from ending run at process barrier
-            # This is borrowed from the official dagster mlflow resource found here:
-            # https://github.com/dagster-io/dagster/blob/master/python_modules/libraries/dagster-mlflow/dagster_mlflow/resources.py
-            atexit.unregister(mlflow.end_run)
-
-            # Create new run under specified experiment
-            with mlflow.start_run(
-                run_id=mlflow_run_id,
-                experiment_id=experiment_id,
-                tags=self.tags | {"dagster_run_id": context.run_id},
-            ):
+            if (active_run := mlflow.active_run()) is not None:
+                if active_run.info.run_id != mlflow_run_id:
+                    raise RuntimeError("Found conflicting active mlflow run!")
                 yield self
+            else:
+                # Create new run under specified experiment
+                with mlflow.start_run(
+                    run_id=mlflow_run_id,
+                    experiment_id=experiment_id,
+                    tags=self.tags | {"dagster_run_id": context.run_id},
+                ):
+                    yield self
 
     def _get_tracking_password(self, version_id: str = "latest"):
         """Get tracking server password from gcloud secrets."""
@@ -115,6 +125,10 @@ def _get_mlflow_run_id(self, dagster_run_id: str, experiment_id: str):
             run_id = run_df.loc[0, "run_id"]
         return run_id
 
+    def get_run_id(self):
+        """Return current dagster run_id."""
+        return self._run_id
+
     @staticmethod
     def get_or_create_experiment(
         experiment_name: str, artifact_location: str = ""
@@ -147,8 +161,9 @@ def experiment_tracker_teardown_factory(
     @op(
         name=f"{experiment_name}_tracker_teardown",
         required_resource_keys=["experiment_tracker"],
+        ins={"model_done": In(Nothing)},
     )
-    def teardown_experiment_tracker(_results):
+    def teardown_experiment_tracker():
         mlflow.end_run()
 
     return teardown_experiment_tracker
diff --git a/src/mozilla_sec_eia/library/models.py b/src/mozilla_sec_eia/library/models.py
index 135df39..6cb7bd8 100644
--- a/src/mozilla_sec_eia/library/models.py
+++ b/src/mozilla_sec_eia/library/models.py
@@ -40,6 +40,7 @@
 
 from .experiment_tracking import (
     ExperimentTracker,
+    MlflowPandasArtifactIOManager,
     experiment_tracker_teardown_factory,
 )
 
@@ -92,6 +93,7 @@ def get_pudl_model_job_name(experiment_name: str) -> str:
 
 def pudl_model(
     experiment_name: str,
+    mlflow_pandas_io_manager_file_type: str = "parquet",
     resources: dict[str, ResourceDefinition] = {},
     config_from_yaml: bool = False,
 ) -> JobDefinition:
@@ -103,11 +105,21 @@ def _decorator(model_graph: GraphDefinition):
             model_config |= get_yml_config(model_graph.name)
 
         # Add resources to resource dict
+        experiment_tracker = ExperimentTracker(
+            experiment_name=experiment_name,
+            tracking_uri=EnvVar("MLFLOW_TRACKING_URI"),
+            project=EnvVar("GCS_PROJECT"),
+        )
         model_resources = {
-            "experiment_tracker": ExperimentTracker(
-                experiment_name=experiment_name,
-                tracking_uri=EnvVar("MLFLOW_TRACKING_URI"),
-                project=EnvVar("GCS_PROJECT"),
+            "experiment_tracker": experiment_tracker,
+            "mlflow_pandas_artifact_io_manager": MlflowPandasArtifactIOManager(
+                file_type=mlflow_pandas_io_manager_file_type,
+                experiment_tracker=experiment_tracker,
+            ),
+            "previous_run_mlflow_pandas_artifact_io_manager": MlflowPandasArtifactIOManager(
+                use_previous_mlflow_run=True,
+                file_type=mlflow_pandas_io_manager_file_type,
+                experiment_tracker=experiment_tracker,
             ),
         } | resources
 
@@ -151,9 +163,7 @@ def model_job(**kwargs):
             graph_output = model_graph(**kwargs)
 
             # Pass output to teardown to create a dependency
-            teardown = tracker_teardown(graph_output)
-
-            _collect_results(graph_output, [teardown])
+            tracker_teardown(graph_output)
 
         PUDL_MODELS[get_pudl_model_job_name(experiment_name)] = model_job
         return model_job
diff --git a/src/mozilla_sec_eia/models/sec10k/extract.py b/src/mozilla_sec_eia/models/sec10k/extract.py
index 795f988..d7b9661 100644
--- a/src/mozilla_sec_eia/models/sec10k/extract.py
+++ b/src/mozilla_sec_eia/models/sec10k/extract.py
@@ -1,10 +1,7 @@
 """Implement top level extraction methods and tooling."""
 
-import io
 import logging
 import math
-import tempfile
-from pathlib import Path
 
 import mlflow
 import numpy as np
@@ -15,19 +12,15 @@
     DynamicOut,
     DynamicOutput,
     GraphDefinition,
+    GraphOut,
+    In,
     OpDefinition,
-    OpExecutionContext,
     Out,
-    ResourceDefinition,
+    Output,
     graph,
     op,
 )
-from mlflow.entities import Run
 
-from mozilla_sec_eia.library.experiment_tracking import (
-    ExperimentTracker,
-    get_most_recent_run,
-)
 from mozilla_sec_eia.library.models import pudl_model
 
 from . import basic_10k
@@ -48,39 +41,8 @@ class ExtractionMetadataSchema(pa.DataFrameModel):
     success: bool = pa.Field(coerce=True)
 
 
-def _load_artifact_as_csv(run: Run, artifact_name: str) -> pd.DataFrame:
-    """Download a CSV and parse to DataFrame from mlflow tracking server."""
-    df = pd.read_csv(
-        io.StringIO(mlflow.artifacts.load_text(run.info.artifact_uri + artifact_name))
-    )
-    return df
-
-
-def _log_artifact_as_csv(
-    artifact: pd.DataFrame, artifact_name: str, index: bool = True
-):
-    """Upload a DataFrame as a CSV to mlflow tracking server."""
-    return mlflow.log_text(artifact.to_csv(index=index), artifact_name)
-
-
-def _load_artifact_as_parquet(run: Run, artifact_name: str) -> pd.DataFrame:
-    """Download a CSV and parse to DataFrame from mlflow tracking server."""
-    df = pd.read_parquet(run.info.artifact_uri + artifact_name)
-    return df
-
-
-def _log_artifact_as_parquet(
-    artifact: pd.DataFrame, artifact_name: str, index: bool = True
-):
-    """Upload a DataFrame as a CSV to mlflow tracking server."""
-    with tempfile.TemporaryDirectory() as tmp_dir:
-        parquet_path = Path(tmp_dir) / artifact_name
-        artifact.to_parquet(parquet_path, index=index)
-        return mlflow.log_artifact(parquet_path, artifact_name)
-
-
 @op
-def get_filings_to_extract(
+def get_filing_metadata(
     cloud_interface: GCSArchive,
 ) -> pd.DataFrame:
     """Return filing metadata."""
@@ -91,19 +53,14 @@ class ChunkFilingsConfig(Config):
     """Config how many filings are extracted and chunk_size for extraction."""
 
     chunk_size: int = 1000
-    num_filings: int = -1
 
 
 @op(out=DynamicOut())
 def chunk_filings(
     config: ChunkFilingsConfig,
-    metadata: pd.DataFrame,
+    filings_to_extract: pd.DataFrame,
 ) -> pd.DataFrame:
     """Split filings into chunks for parallel extraction."""
-    filings_to_extract = metadata
-    if config.num_filings > 0:
-        filings_to_extract = filings_to_extract.sample(config.num_filings)
-
     for i, chunk in enumerate(
         np.array_split(
             filings_to_extract, math.ceil(len(filings_to_extract) / config.chunk_size)
@@ -119,42 +76,12 @@ class GetMostRecentRunResultsConfig(Config):
 
 
 @op(
+    required_resource_keys=["experiment_tracker"],
     out={
-        "extraction_metadata": Out(),
-        "extracted": Out(),
-        "filings_to_extract": Out(),
+        "extraction_metadata": Out(io_manager_key="mlflow_pandas_artifact_io_manager"),
+        "extracted": Out(io_manager_key="mlflow_pandas_artifact_io_manager"),
     },
 )
-def get_most_recent_run_results(
-    context: OpExecutionContext,
-    config: GetMostRecentRunResultsConfig,
-    experiment_tracker: ExperimentTracker,
-    filings_to_extract: pd.DataFrame,
-) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
-    """Get results from previous run to continue extraction."""
-    extraction_metadata = pd.DataFrame(
-        {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)}
-    ).set_index("filename")
-    extracted = pd.DataFrame()
-
-    if config.continue_run:
-        most_recent_run = get_most_recent_run(
-            experiment_tracker.experiment_name, context.run_id
-        )
-        extraction_metadata = ExtractionMetadataSchema.validate(
-            _load_artifact_as_csv(
-                most_recent_run, "/extraction_metadata.csv"
-            ).set_index("filename")
-        )
-        extracted = _load_artifact_as_parquet(most_recent_run, "/extracted.parquet")
-        filings_to_extract = filings_to_extract[
-            ~filings_to_extract["filename"].isin(extraction_metadata.index)
-        ]
-
-    return extraction_metadata, extracted, filings_to_extract
-
-
-@op(required_resource_keys=["experiment_tracker"])
 def log_extraction_data(
     metadata: pd.DataFrame,
     extraction_metadata: list[pd.DataFrame],
@@ -176,29 +103,50 @@ def log_extraction_data(
         }
     )
 
-    # Log the extraction results + metadata for future reference/analysis
-    _log_artifact_as_csv(extraction_metadata, "extraction_metadata.csv")
-    _log_artifact_as_parquet(extracted, "extracted.parquet")
+    # Return metadata and extracted data (they'll be logged as artifacts by io-manager)
+    return extraction_metadata, extracted
+
+
+class FilingsToExtractConfig(Config):
+    """Define configuration for filtering filings to extract."""
+
+    num_filings: int = -1
+
+
+@op
+def get_filings_to_extract(
+    config: FilingsToExtractConfig,
+    filing_metadata: pd.DataFrame,
+    previous_extraction_metadata: pd.DataFrame,
+    previous_extracted: pd.DataFrame,
+):
+    """Sub-sample to `num_filings` (if positive), then filter out any previously extracted filings."""
+    filings_to_extract = filing_metadata
+    if config.num_filings > 0:
+        filings_to_extract = filings_to_extract.sample(config.num_filings)
+
+    filings_to_extract = filings_to_extract[
+        ~filings_to_extract["filename"].isin(previous_extraction_metadata.index)
+    ]
+    return filings_to_extract
 
 
-def extract_model_factory(
+def extract_graph_factory(
     dataset_name: str,
     extract_op: OpDefinition | GraphDefinition,
-    resources: dict[str, ResourceDefinition] = {},
 ):
     """Produce a `pudl_model` to extract data from sec10k filings."""
     experiment_name = f"{dataset_name}_extraction"
 
-    @pudl_model(
-        experiment_name=experiment_name,
-        resources={"cloud_interface": cloud_interface_resource} | resources,
-    )
     @graph(name=experiment_name)
-    def extract_filings():
-        metadata = get_filings_to_extract()
-        previous_extraction_metadata, previous_extracted, filings_to_extract = (
-            get_most_recent_run_results(metadata)
+    def extract_filings(previous_extraction_metadata, previous_extracted):
+        metadata = get_filing_metadata()
+        filings_to_extract = get_filings_to_extract(
+            metadata,
+            previous_extraction_metadata,
+            previous_extracted,
         )
+
         filing_chunks = chunk_filings(filings_to_extract)
         extraction_metadata, extracted = filing_chunks.map(extract_op)
 
@@ -213,6 +161,84 @@ def extract_filings():
     return extract_filings
 
 
+@op(
+    ins={
+        "extraction_metadata": In(
+            input_manager_key="previous_run_mlflow_pandas_artifact_io_manager"
+        ),
+        "extracted": In(
+            input_manager_key="previous_run_mlflow_pandas_artifact_io_manager"
+        ),
+    }
+)
+def get_previous_run_data(
+    continue_previous_run, extraction_metadata: pd.DataFrame, extracted: pd.DataFrame
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Return previous run data loaded by io-manager."""
+    extraction_metadata = ExtractionMetadataSchema.validate(extraction_metadata)
+
+    return extraction_metadata, extracted
+
+
+@op
+def get_empty_run_data(start_new_run):
+    """Return empty dataframes representing run metadata and extracted data."""
+    extraction_metadata = pd.DataFrame(
+        {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)}
+    ).set_index("filename")
+    extracted = pd.DataFrame()
+
+    return extraction_metadata, extracted
+
+
+class ContinuePreviousRunConfig(Config):
+    """Configure whether to continue a previous extraction run or not."""
+
+    continue_run: bool = False
+
+
+@op(out={"continue_run": Out(is_required=False), "new_run": Out(is_required=False)})
+def continue_previous_run(config: ContinuePreviousRunConfig):
+    """Create branch dictating whether a previous extraction run is continued or not."""
+    if config.continue_run:
+        yield Output(True, "continue_run")
+    else:
+        yield Output(True, "new_run")
+
+
+@op(out={"previous_run_extraction_metadata": Out(), "previous_extracted": Out()})
+def merge_branches(dfs: list[tuple[pd.DataFrame, pd.DataFrame]]):
+    """Merge branches created by `continue_previous_run` and return."""
+    dfs = dfs[0]
+    return dfs[0], dfs[1]
+
+
+@graph(
+    out={
+        "previous_run_extraction_metadata": GraphOut(),
+        "previous_extracted": GraphOut(),
+    }
+)
+def get_starting_data():
+    """Get previous run data if configured to do so."""
+    continue_run, new_run = continue_previous_run()
+    previous_data = get_previous_run_data(continue_run)
+    new_data = get_empty_run_data(new_run)
+    return merge_branches([previous_data, new_data])
+
+
+@pudl_model(
+    "basic_10k_extraction", resources={"cloud_interface": cloud_interface_resource}
+)
+@graph
+def basic_10k_extraction_model():
+    """Implement basic 10k extraction pudl_model."""
+    previous_extraction_metadata, previous_extracted = get_starting_data()
+    return extract_graph_factory("basic_10k", basic_10k.extract)(
+        previous_extraction_metadata, previous_extracted
+    )
+
+
 def compute_validation_metrics(
     computed_set: pd.DataFrame,
     validation_set: pd.DataFrame,
@@ -251,6 +277,3 @@ def compute_validation_metrics(
         "precision": true_positives / computed_len,
         "recall": true_positives / validation_len,
     }
-
-
-basic_10k_extract = extract_model_factory("basic_10k", basic_10k.extract)
diff --git a/tests/conftest.py b/tests/conftest.py
index c10e825..4e9b0f6 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -3,6 +3,7 @@
 import logging
 from pathlib import Path
 
+import mlflow
 import pytest
 from mozilla_sec_eia.library.experiment_tracking import ExperimentTracker
 
@@ -58,3 +59,19 @@ def factory(experiment_name: str) -> TestTracker:
         )
 
     return factory
+
+
+@pytest.fixture
+def get_most_recent_mlflow_run_factory():
+    def _get_run(experiment_name: str):
+        """Search mlflow for most recent run with specified experiment name."""
+        run_metadata = mlflow.search_runs(
+            experiment_names=[experiment_name],
+        )
+
+        # Mlflow returns runs ordered by start time (newest first) by default, so row 0 should be the latest run
+        # This assert will ensure this doesn't silently break if the ordering changes
+        assert run_metadata.loc[0, "end_time"] == run_metadata["end_time"].max()
+        return mlflow.get_run(run_metadata.loc[0, "run_id"])
+
+    return _get_run
diff --git a/tests/unit/models/sec10k/extract_test.py b/tests/unit/models/sec10k/extract_test.py
index 925b5c1..d4421fb 100644
--- a/tests/unit/models/sec10k/extract_test.py
+++ b/tests/unit/models/sec10k/extract_test.py
@@ -4,14 +4,14 @@
 
 import pandas as pd
 import pytest
-from dagster import Out, op
-from mozilla_sec_eia.library.experiment_tracking import (
-    get_most_recent_run,
+from dagster import Out, RunConfig, op
+from mozilla_sec_eia.library.experiment_tracking.mlflow_io_managers import (
+    MlflowPandasArtifactIOManager,
 )
 from mozilla_sec_eia.models.sec10k.extract import (
-    ChunkFilingsConfig,
+    FilingsToExtractConfig,
     compute_validation_metrics,
-    extract_model_factory,
+    extract_graph_factory,
 )
 from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive
 
@@ -63,11 +63,58 @@ def second_run_results():
     )
 
 
+@pytest.mark.parametrize(
+    "filings_metadata,previous_extraction_metadata,num_filings,num_failed",
+    [
+        (
+            pd.DataFrame(
+                {"filename": ["filing1", "filing2", "filing3", "filing4", "filing5"]}
+            ),
+            pd.DataFrame(
+                {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)}
+            ).set_index("filename"),
+            -1,
+            0,
+        ),
+        (
+            pd.DataFrame(
+                {"filename": ["filing1", "filing2", "filing3", "filing4", "filing5"]}
+            ),
+            pd.DataFrame(
+                {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)}
+            ).set_index("filename"),
+            -1,
+            3,
+        ),
+        (
+            pd.DataFrame(
+                {"filename": ["filing1", "filing2", "filing3", "filing4", "filing5"]}
+            ),
+            pd.DataFrame(
+                {"filename": ["filing1", "filing2"], "success": [True, True]}
+            ).set_index("filename"),
+            -1,
+            0,
+        ),
+        (
+            pd.DataFrame(
+                {"filename": ["filing1", "filing2", "filing3", "filing4", "filing5"]}
+            ),
+            pd.DataFrame(
+                {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)}
+            ).set_index("filename"),
+            2,
+            1,
+        ),
+    ],
+)
 def test_sec10k_extract_pipeline(
     filings_metadata,
-    first_run_results,
-    second_run_results,
+    previous_extraction_metadata,
+    num_filings,
+    num_failed,
     test_tracker_factory,
+    get_most_recent_mlflow_run_factory,
 ):
     """Test high level extraction workflow."""
 
@@ -85,35 +132,52 @@ def setup_for_execution(self, context):
         def get_metadata(self):
             return filings_metadata
 
+    @op(out={"extraction_metadata": Out(), "extracted": Out()})
+    def test_extract(
+        cloud_interface: GCSArchive,
+        filings_to_extract: pd.DataFrame,
+    ) -> tuple[pd.DataFrame, pd.DataFrame]:
+        md = filings_to_extract
+        md["success"] = True
+        md.iloc[:num_failed, 1] = False
+        return md.set_index("filename"), pd.DataFrame()
+
     dataset_name = "test_pipeline"
     experiment_name = f"{dataset_name}_extraction"
     test_tracker = test_tracker_factory(experiment_name)
 
-    for i, results in enumerate([first_run_results, second_run_results]):
-
-        @op(out={"extraction_metadata": Out(), "extracted": Out()})
-        def _fake_extract(_filings_to_extract):
-            return results[0], results[1]
-
-        resources = {
-            "basic_10k_extract_config": ChunkFilingsConfig(
-                num_filings=3 if i == 0 else -1
+    test_graph = extract_graph_factory("test_extract", test_extract)
+    resources = {
+        "experiment_tracker": test_tracker,
+        "cloud_interface": FakeArchive(),
+        "mlflow_pandas_artifact_io_manager": MlflowPandasArtifactIOManager(
+            experiment_tracker=test_tracker
+        ),
+    }
+    extraction_metadata = (
+        test_graph.to_job()
+        .execute_in_process(
+            resources=resources,
+            run_config=RunConfig(
+                {
+                    "get_filings_to_extract": FilingsToExtractConfig(
+                        num_filings=num_filings
+                    )
+                }
             ),
-            "experiment_tracker": test_tracker,
-            "cloud_interface": FakeArchive(),
-        }
-        test_job = extract_model_factory(
-            dataset_name, _fake_extract, resources=resources
-        )
-        metadata = results[0]
-
-        # Run extract method
-        test_job.execute_in_process()
-        run = get_most_recent_run(experiment_name, dagster_run_id="")
-        assert run.data.metrics["num_failed"] == (~metadata["success"]).sum()
-        assert run.data.metrics["ratio_extracted"] == len(metadata) / len(
-            filings_metadata
+            input_values={
+                "previous_extraction_metadata": previous_extraction_metadata,
+                "previous_extracted": pd.DataFrame(),
+            },
         )
+        .output_value()
+    )
+
+    run = get_most_recent_mlflow_run_factory(experiment_name)
+    assert run.data.metrics["num_failed"] == num_failed
+    assert run.data.metrics["ratio_extracted"] == len(extraction_metadata) / len(
+        filings_metadata
+    )
 
 
 @pytest.mark.parametrize(

From 07713e910441175d9bb4e86d37bf0cfdf62f75f7 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 28 Aug 2024 22:38:47 -0400
Subject: [PATCH 012/161] Simplify pudl_models decorator

---
 src/mozilla_sec_eia/library/models.py | 42 ++++++++++++---------------
 1 file changed, 18 insertions(+), 24 deletions(-)

diff --git a/src/mozilla_sec_eia/library/models.py b/src/mozilla_sec_eia/library/models.py
index 6cb7bd8..427d21d 100644
--- a/src/mozilla_sec_eia/library/models.py
+++ b/src/mozilla_sec_eia/library/models.py
@@ -33,7 +33,6 @@
     RunConfig,
     failure_hook,
     job,
-    op,
     success_hook,
 )
 from mlflow.entities.run_status import RunStatus
@@ -91,6 +90,24 @@ def get_pudl_model_job_name(experiment_name: str) -> str:
     return f"{experiment_name}_job"
 
 
+@success_hook(required_resource_keys={"experiment_tracker"})
+def _log_config_hook(context: HookContext):
+    if (config := context.op_config) is not None:
+        mlflow.log_params(
+            {f"{context.op.name}.{param}": value for param, value in config.items()}
+        )
+
+
+@failure_hook(required_resource_keys={"experiment_tracker"})
+def _end_mlflow_run_with_failure(context: HookContext):
+    exception = context.op_exception
+
+    if isinstance(exception, KeyboardInterrupt):
+        mlflow.end_run(status=RunStatus.to_string(RunStatus.KILLED))
+    else:
+        mlflow.end_run(status=RunStatus.to_string(RunStatus.FAILED))
+
+
 def pudl_model(
     experiment_name: str,
     mlflow_pandas_io_manager_file_type: str = "parquet",
@@ -127,29 +144,6 @@ def _decorator(model_graph: GraphDefinition):
             ops=model_config,
         )
 
-        @op
-        def _collect_results(model_graph_output, _implicit_dependencies: list):
-            return model_graph_output
-
-        @success_hook(required_resource_keys={"experiment_tracker"})
-        def _log_config_hook(context: HookContext):
-            if (config := context.op_config) is not None:
-                mlflow.log_params(
-                    {
-                        f"{context.op.name}.{param}": value
-                        for param, value in config.items()
-                    }
-                )
-
-        @failure_hook(required_resource_keys={"experiment_tracker"})
-        def _end_mlflow_run_with_failure(context: HookContext):
-            exception = context.op_exception
-
-            if isinstance(exception, KeyboardInterrupt):
-                mlflow.end_run(status=RunStatus.to_string(RunStatus.KILLED))
-            else:
-                mlflow.end_run(status=RunStatus.to_string(RunStatus.FAILED))
-
         @job(
             name=get_pudl_model_job_name(experiment_name),
             config=default_config,

From 5d89ec6e7bf7a7246581c084bdeae842659bb45c Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Thu, 29 Aug 2024 09:37:09 -0400
Subject: [PATCH 013/161] Split extraction logging into two funcs

---
 src/mozilla_sec_eia/models/sec10k/extract.py | 46 ++++++++++++-------
 tests/unit/models/sec10k/extract_test.py     | 47 +-------------------
 2 files changed, 31 insertions(+), 62 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/extract.py b/src/mozilla_sec_eia/models/sec10k/extract.py
index d7b9661..f759ddf 100644
--- a/src/mozilla_sec_eia/models/sec10k/extract.py
+++ b/src/mozilla_sec_eia/models/sec10k/extract.py
@@ -75,6 +75,23 @@ class GetMostRecentRunResultsConfig(Config):
     continue_run: bool = False
 
 
+@op(required_resource_keys=["experiment_tracker"])
+def log_extraction_data(
+    metadata: pd.DataFrame,
+    extraction_metadata: pd.DataFrame,
+    extracted: pd.DataFrame,
+):
+    """Log results from extraction run."""
+    mlflow.log_metrics(
+        {
+            "num_failed": (~extraction_metadata["success"]).sum(),
+            "ratio_extracted": len(extraction_metadata) / len(metadata),
+        }
+    )
+
+    return extraction_metadata, extracted
+
+
 @op(
     required_resource_keys=["experiment_tracker"],
     out={
@@ -82,28 +99,20 @@ class GetMostRecentRunResultsConfig(Config):
         "extracted": Out(io_manager_key="mlflow_pandas_artifact_io_manager"),
     },
 )
-def log_extraction_data(
-    metadata: pd.DataFrame,
+def merge_extracted_data(
     extraction_metadata: list[pd.DataFrame],
     extracted: list[pd.DataFrame],
     previous_run_extraction_metadata: pd.DataFrame,
     previous_run_extracted_data: pd.DataFrame,
 ):
-    """Log results from extraction run."""
+    """Merge data extracted in parallel ops with any data from the previous run."""
     extraction_metadata = pd.concat(
         extraction_metadata + [previous_run_extraction_metadata]
     )
     extracted = pd.concat(extracted + [previous_run_extracted_data])
     # Use metadata to log generic metrics
     extraction_metadata = ExtractionMetadataSchema.validate(extraction_metadata)
-    mlflow.log_metrics(
-        {
-            "num_failed": (~extraction_metadata["success"]).sum(),
-            "ratio_extracted": len(extraction_metadata) / len(metadata),
-        }
-    )
 
-    # Return metadata and extracted data (they'll be logged as artifacts by io-manager)
     return extraction_metadata, extracted
 
 
@@ -149,15 +158,19 @@ def extract_filings(previous_extraction_metadata, previous_extracted):
 
         filing_chunks = chunk_filings(filings_to_extract)
         extraction_metadata, extracted = filing_chunks.map(extract_op)
-
-        return log_extraction_data(
-            metadata,
+        extraction_metadata, extracted = merge_extracted_data(
             extraction_metadata.collect(),
             extracted.collect(),
             previous_extraction_metadata,
             previous_extracted,
         )
 
+        return log_extraction_data(
+            metadata,
+            extraction_metadata,
+            extracted,
+        )
+
     return extract_filings
 
 
@@ -227,6 +240,9 @@ def get_starting_data():
     return merge_branches([previous_data, new_data])
 
 
+basic_10k_extract_graph = extract_graph_factory("basic_10k", basic_10k.extract)
+
+
 @pudl_model(
     "basic_10k_extraction", resources={"cloud_interface": cloud_interface_resource}
 )
@@ -234,9 +250,7 @@ def get_starting_data():
 def basic_10k_extraction_model():
     """Implement basic 10k extraction pudl_model."""
     previous_extraction_metadata, previous_extracted = get_starting_data()
-    return extract_graph_factory("basic_10k", basic_10k.extract)(
-        previous_extraction_metadata, previous_extracted
-    )
+    return basic_10k_extract_graph(previous_extraction_metadata, previous_extracted)
 
 
 def compute_validation_metrics(
diff --git a/tests/unit/models/sec10k/extract_test.py b/tests/unit/models/sec10k/extract_test.py
index d4421fb..23ce092 100644
--- a/tests/unit/models/sec10k/extract_test.py
+++ b/tests/unit/models/sec10k/extract_test.py
@@ -18,51 +18,6 @@
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
 
-@pytest.fixture
-def filings_metadata() -> pd.DataFrame:
-    """Return fake filing metadata."""
-    return pd.DataFrame(
-        {
-            "filename": [
-                "filing1",
-                "filing2",
-                "filing3",
-                "filing3",
-                "filing4",
-                "filing5",
-            ],
-        }
-    )
-
-
-@pytest.fixture
-def first_run_results():
-    """Metadata and extracted table from first run of extractor."""
-    return (
-        pd.DataFrame(
-            {
-                "filename": ["filing1", "filing2", "filing3"],
-                "success": [True, True, False],
-            }
-        ).set_index("filename"),
-        pd.DataFrame({"column": ["extracted table (not needed for test)"]}),
-    )
-
-
-@pytest.fixture
-def second_run_results():
-    """Metadata and extracted table from first run of extractor."""
-    return (
-        pd.DataFrame(
-            {
-                "filename": ["filing1", "filing2", "filing3", "filing4", "filing5"],
-                "success": [True, True, False, True, True],
-            },
-        ).set_index("filename"),
-        pd.DataFrame({"column": ["extracted table (not needed for test)"]}),
-    )
-
-
 @pytest.mark.parametrize(
     "filings_metadata,previous_extraction_metadata,num_filings,num_failed",
     [
@@ -154,7 +109,7 @@ def test_extract(
             experiment_tracker=test_tracker
         ),
     }
-    extraction_metadata = (
+    extraction_metadata, extracted = (
         test_graph.to_job()
         .execute_in_process(
             resources=resources,

From c57818a3f416cc4e2e4ec0e46e25e3fb0ef2231b Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Thu, 29 Aug 2024 10:06:34 -0400
Subject: [PATCH 014/161] Add mlflow metrics io-manager

---
 .../library/experiment_tracking/__init__.py   |  2 +-
 .../experiment_tracking/mlflow_io_managers.py | 54 +++++++++++++------
 src/mozilla_sec_eia/library/models.py         |  4 ++
 src/mozilla_sec_eia/models/sec10k/extract.py  | 27 +++++-----
 tests/unit/models/sec10k/extract_test.py      | 35 ++++++------
 5 files changed, 75 insertions(+), 47 deletions(-)

diff --git a/src/mozilla_sec_eia/library/experiment_tracking/__init__.py b/src/mozilla_sec_eia/library/experiment_tracking/__init__.py
index 5a468d7..edf7817 100644
--- a/src/mozilla_sec_eia/library/experiment_tracking/__init__.py
+++ b/src/mozilla_sec_eia/library/experiment_tracking/__init__.py
@@ -1,6 +1,6 @@
 """Implement tooling to interface with mlflow experiment tracking."""
 
-from .mlflow_io_managers import MlflowPandasArtifactIOManager
+from .mlflow_io_managers import MlflowMetricsIOManager, MlflowPandasArtifactIOManager
 from .mlflow_resource import (
     ExperimentTracker,
     experiment_tracker_teardown_factory,
diff --git a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_io_managers.py b/src/mozilla_sec_eia/library/experiment_tracking/mlflow_io_managers.py
index a1a6850..bc42138 100644
--- a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_io_managers.py
+++ b/src/mozilla_sec_eia/library/experiment_tracking/mlflow_io_managers.py
@@ -16,12 +16,32 @@
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
 
-class MlflowPandasArtifactIOManager(ConfigurableIOManager):
-    """Implement IO manager for logging/loading parquet files as mlflow artifacts."""
+class MlflowBaseIOManager(ConfigurableIOManager):
+    """Specify base config and implement helper functions for mlflow io-managers."""
 
     experiment_tracker: ExperimentTracker
     #: By default handles artifacts from current run, but can be used with previous run.
     use_previous_mlflow_run: bool = False
+
+    def _get_run_info(self) -> Run:
+        """Use `dagster_run_id` and `use_previous_mlflow_run` to get run info from appropriate mlflow run."""
+        dagster_run_id = self.experiment_tracker.get_run_id()
+        filter_string = f"tags.dagster_run_id='{dagster_run_id}'"
+        if self.use_previous_mlflow_run:
+            filter_string = f"tags.dagster_run_id!='{dagster_run_id}'"
+
+        run_metadata = mlflow.search_runs(
+            experiment_names=[self.experiment_tracker.experiment_name],
+            filter_string=filter_string,
+        )
+
+        # Mlflow returns runs ordered by their runtime, so it's easy to grab the latest run
+        return mlflow.get_run(run_metadata.loc[0, "run_id"])
+
+
+class MlflowPandasArtifactIOManager(MlflowBaseIOManager):
+    """Implement IO manager for logging/loading parquet files as mlflow artifacts."""
+
     file_type: Literal["parquet", "csv"] = "parquet"
 
     def _load_artifact_as_csv(self, run: Run, artifact_name: str) -> pd.DataFrame:
@@ -69,21 +89,6 @@ def handle_output(self, context: OutputContext, df: pd.DataFrame):
         else:
             self._log_artifact_as_parquet(df, artifact_name=f"{context.name}.parquet")
 
-    def _get_run_info(self) -> Run:
-        """Use `dagster_run_id` and `use_previous_mlflow_run` to get run info from appropriate mlflow run."""
-        dagster_run_id = self.experiment_tracker.get_run_id()
-        filter_string = f"tags.dagster_run_id='{dagster_run_id}'"
-        if self.use_previous_mlflow_run:
-            filter_string = f"tags.dagster_run_id!='{dagster_run_id}'"
-
-        run_metadata = mlflow.search_runs(
-            experiment_names=[self.experiment_tracker.experiment_name],
-            filter_string=filter_string,
-        )
-
-        # Mlflow returns runs ordered by their runtime, so it's easy to grab the latest run
-        return mlflow.get_run(run_metadata.loc[0, "run_id"])
-
     def load_input(self, context: InputContext) -> pd.DataFrame:
         """Handle loading dataframes from mlflow run artifacts."""
         mlflow_run = self._get_run_info()
@@ -98,3 +103,18 @@ def load_input(self, context: InputContext) -> pd.DataFrame:
             )
 
         return df
+
+
+class MlflowMetricsIOManager(MlflowBaseIOManager):
+    """Log/load metrics from mlflow tracking server."""
+
+    experiment_tracker: ExperimentTracker
+
+    def handle_output(self, context: OutputContext, obj: dict[str, float]):
+        """Log metrics to mlflow run/experiment from `experiment_tracker`."""
+        mlflow.log_metrics(obj)
+
+    def load_input(self, context: InputContext) -> dict[str, float]:
+        """Load metrics from the mlflow run returned by `_get_run_info`."""
+        run = self._get_run_info()
+        return run.data.metrics
diff --git a/src/mozilla_sec_eia/library/models.py b/src/mozilla_sec_eia/library/models.py
index 427d21d..f506d76 100644
--- a/src/mozilla_sec_eia/library/models.py
+++ b/src/mozilla_sec_eia/library/models.py
@@ -39,6 +39,7 @@
 
 from .experiment_tracking import (
     ExperimentTracker,
+    MlflowMetricsIOManager,
     MlflowPandasArtifactIOManager,
     experiment_tracker_teardown_factory,
 )
@@ -138,6 +139,9 @@ def _decorator(model_graph: GraphDefinition):
                 file_type=mlflow_pandas_io_manager_file_type,
                 experiment_tracker=experiment_tracker,
             ),
+            "mlflow_metrics_io_manager": MlflowMetricsIOManager(
+                experiment_tracker=experiment_tracker,
+            ),
         } | resources
 
         default_config = RunConfig(
diff --git a/src/mozilla_sec_eia/models/sec10k/extract.py b/src/mozilla_sec_eia/models/sec10k/extract.py
index f759ddf..53e3b41 100644
--- a/src/mozilla_sec_eia/models/sec10k/extract.py
+++ b/src/mozilla_sec_eia/models/sec10k/extract.py
@@ -3,7 +3,6 @@
 import logging
 import math
 
-import mlflow
 import numpy as np
 import pandas as pd
 import pandera as pa
@@ -75,21 +74,17 @@ class GetMostRecentRunResultsConfig(Config):
     continue_run: bool = False
 
 
-@op(required_resource_keys=["experiment_tracker"])
+@op(out={"basic_extraction_metrics": Out(io_manager_key="mlflow_metrics_io_manager")})
 def log_extraction_data(
     metadata: pd.DataFrame,
     extraction_metadata: pd.DataFrame,
     extracted: pd.DataFrame,
 ):
     """Log results from extraction run."""
-    mlflow.log_metrics(
-        {
-            "num_failed": (~extraction_metadata["success"]).sum(),
-            "ratio_extracted": len(extraction_metadata) / len(metadata),
-        }
-    )
-
-    return extraction_metadata, extracted
+    return {
+        "num_failed": (~extraction_metadata["success"]).sum(),
+        "ratio_extracted": len(extraction_metadata) / len(metadata),
+    }
 
 
 @op(
@@ -147,7 +142,14 @@ def extract_graph_factory(
     """Produce a `pudl_model` to extract data from sec10k filings."""
     experiment_name = f"{dataset_name}_extraction"
 
-    @graph(name=experiment_name)
+    @graph(
+        name=experiment_name,
+        out={
+            "extraction_metadata": GraphOut(),
+            "extracted": GraphOut(),
+            "extraction_metrics": GraphOut(),
+        },
+    )
     def extract_filings(previous_extraction_metadata, previous_extracted):
         metadata = get_filing_metadata()
         filings_to_extract = get_filings_to_extract(
@@ -165,11 +167,12 @@ def extract_filings(previous_extraction_metadata, previous_extracted):
             previous_extracted,
         )
 
-        return log_extraction_data(
+        extraction_metrics = log_extraction_data(
             metadata,
             extraction_metadata,
             extracted,
         )
+        return extraction_metadata, extracted, extraction_metrics
 
     return extract_filings
 
diff --git a/tests/unit/models/sec10k/extract_test.py b/tests/unit/models/sec10k/extract_test.py
index 23ce092..2b41936 100644
--- a/tests/unit/models/sec10k/extract_test.py
+++ b/tests/unit/models/sec10k/extract_test.py
@@ -6,6 +6,7 @@
 import pytest
 from dagster import Out, RunConfig, op
 from mozilla_sec_eia.library.experiment_tracking.mlflow_io_managers import (
+    MlflowMetricsIOManager,
     MlflowPandasArtifactIOManager,
 )
 from mozilla_sec_eia.models.sec10k.extract import (
@@ -108,24 +109,23 @@ def test_extract(
         "mlflow_pandas_artifact_io_manager": MlflowPandasArtifactIOManager(
             experiment_tracker=test_tracker
         ),
+        "mlflow_metrics_io_manager": MlflowMetricsIOManager(
+            experiment_tracker=test_tracker,
+        ),
     }
-    extraction_metadata, extracted = (
-        test_graph.to_job()
-        .execute_in_process(
-            resources=resources,
-            run_config=RunConfig(
-                {
-                    "get_filings_to_extract": FilingsToExtractConfig(
-                        num_filings=num_filings
-                    )
-                }
-            ),
-            input_values={
-                "previous_extraction_metadata": previous_extraction_metadata,
-                "previous_extracted": pd.DataFrame(),
-            },
-        )
-        .output_value()
+    graph_result = test_graph.to_job().execute_in_process(
+        resources=resources,
+        run_config=RunConfig(
+            {"get_filings_to_extract": FilingsToExtractConfig(num_filings=num_filings)}
+        ),
+        input_values={
+            "previous_extraction_metadata": previous_extraction_metadata,
+            "previous_extracted": pd.DataFrame(),
+        },
+    )
+    extraction_metadata, metrics = (
+        graph_result.output_value("extraction_metadata"),
+        graph_result.output_value("extraction_metrics"),
     )
 
     run = get_most_recent_mlflow_run_factory(experiment_name)
@@ -133,6 +133,7 @@ def test_extract(
     assert run.data.metrics["ratio_extracted"] == len(extraction_metadata) / len(
         filings_metadata
     )
+    assert run.data.metrics == metrics
 
 
 @pytest.mark.parametrize(

From 625783b018ec8005658c818f1ea58e39810509f2 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Thu, 29 Aug 2024 11:52:42 -0400
Subject: [PATCH 015/161] Change pudl_model to pudl_pipeline

---
 src/mozilla_sec_eia/library/__init__.py       |  9 +--
 .../library/experiment_tracking/__init__.py   | 31 ++++++-
 .../library/experiment_tracking/validation.py | 58 ++++++++++++++
 .../library/models/__init__.py                |  3 +
 .../{models.py => models/pipelines.py}        | 80 ++++++++-----------
 src/mozilla_sec_eia/models/sec10k/extract.py  | 51 ++----------
 .../{model_jobs.py => pudl_pipelines.py}      |  4 +-
 tests/unit/models/sec10k/extract_test.py      | 74 -----------------
 8 files changed, 137 insertions(+), 173 deletions(-)
 create mode 100644 src/mozilla_sec_eia/library/experiment_tracking/validation.py
 create mode 100644 src/mozilla_sec_eia/library/models/__init__.py
 rename src/mozilla_sec_eia/library/{models.py => models/pipelines.py} (69%)
 rename src/mozilla_sec_eia/{model_jobs.py => pudl_pipelines.py} (77%)

diff --git a/src/mozilla_sec_eia/library/__init__.py b/src/mozilla_sec_eia/library/__init__.py
index f3448ee..3bd8694 100644
--- a/src/mozilla_sec_eia/library/__init__.py
+++ b/src/mozilla_sec_eia/library/__init__.py
@@ -3,11 +3,6 @@
 from . import models
 
 
-def get_ml_model_resources():
-    """Return default configuration for all PUDL models."""
-    return models.MODEL_RESOURCES
-
-
-def get_ml_model_jobs() -> list[str]:
+def get_ml_pipeline_jobs() -> list[str]:
     """Return all jobs created through `pudl_model` decorator."""
-    return list(models.PUDL_MODELS.values())
+    return list(models.PUDL_PIPELINES.values())
diff --git a/src/mozilla_sec_eia/library/experiment_tracking/__init__.py b/src/mozilla_sec_eia/library/experiment_tracking/__init__.py
index edf7817..fe6a070 100644
--- a/src/mozilla_sec_eia/library/experiment_tracking/__init__.py
+++ b/src/mozilla_sec_eia/library/experiment_tracking/__init__.py
@@ -1,8 +1,37 @@
 """Implement tooling to interface with mlflow experiment tracking."""
 
-from .mlflow_io_managers import MlflowMetricsIOManager, MlflowPandasArtifactIOManager
+from .mlflow_io_managers import (
+    MlflowBaseIOManager,
+    MlflowMetricsIOManager,
+    MlflowPandasArtifactIOManager,
+)
 from .mlflow_resource import (
     ExperimentTracker,
     experiment_tracker_teardown_factory,
     get_most_recent_run,
 )
+
+
+def get_mlflow_io_manager(
+    key: str, experiment_tracker: ExperimentTracker, pandas_file_type: str = "parquet"
+) -> MlflowBaseIOManager:
+    """Construct IO-manager based on key."""
+    if key == "mlflow_pandas_artifact_io_manager":
+        io_manager = MlflowPandasArtifactIOManager(
+            file_type=pandas_file_type,
+            experiment_tracker=experiment_tracker,
+        )
+    elif key == "previous_run_mlflow_pandas_artifact_io_manager":
+        io_manager = MlflowPandasArtifactIOManager(
+            file_type=pandas_file_type,
+            experiment_tracker=experiment_tracker,
+            use_previous_mlflow_run=True,
+        )
+    elif key == "mlflow_metrics_io_manager":
+        io_manager = MlflowMetricsIOManager(
+            experiment_tracker=experiment_tracker,
+        )
+    else:
+        raise RuntimeError(f"MlFlow IO-manager, {key}, does not exist.")
+
+    return io_manager
diff --git a/src/mozilla_sec_eia/library/experiment_tracking/validation.py b/src/mozilla_sec_eia/library/experiment_tracking/validation.py
new file mode 100644
index 0000000..c8aebe1
--- /dev/null
+++ b/src/mozilla_sec_eia/library/experiment_tracking/validation.py
@@ -0,0 +1,58 @@
+"""Implement common utilities/functions for validating models."""
+
+import pandas as pd
+from dagster import OpDefinition, Out, op
+
+
+def _pandas_compute_precision_recall(
+    computed_set: pd.DataFrame,
+    validation_set: pd.DataFrame,
+    value_col: str,
+) -> dict:
+    """Compute precision and recall for extraction compared to validation set.
+
+    Args:
+        computed_set: Extracted data.
+        validation_set: Expected extraction results.
+        value_col: Column to compare when computing metrics.
+    """
+    # Get initial length of both sets
+    computed_len = len(computed_set)
+    validation_len = len(validation_set)
+
+    # Get index of rows only in one set and make Null in other set
+    idx_validation_only = validation_set.index.difference(computed_set.index)
+    padded_compute_set = pd.concat(
+        [
+            computed_set[value_col],
+            pd.Series([None] * len(idx_validation_only), index=idx_validation_only),
+        ]
+    ).sort_index()
+    idx_compute_only = computed_set.index.difference(validation_set.index)
+    padded_validation_set = pd.concat(
+        [
+            validation_set[value_col],
+            pd.Series([None] * len(idx_compute_only), index=idx_compute_only),
+        ]
+    ).sort_index()
+
+    true_positives = (padded_compute_set == padded_validation_set).sum()
+
+    return {
+        "precision": true_positives / computed_len,
+        "recall": true_positives / validation_len,
+    }
+
+
+def pandas_precision_recall_op_factory(value_col: str) -> OpDefinition:
+    """Return an op that will compute precision/recall on `value_col` of dataframe."""
+
+    @op(
+        out={
+            "precision_recall_metrics": Out(io_manager_key="mlflow_metrics_io_manager")
+        }
+    )
+    def _precision_recall_op(computed_set: pd.DataFrame, validation_set: pd.DataFrame):
+        return _pandas_compute_precision_recall(computed_set, validation_set, value_col)
+
+    return _precision_recall_op
diff --git a/src/mozilla_sec_eia/library/models/__init__.py b/src/mozilla_sec_eia/library/models/__init__.py
new file mode 100644
index 0000000..3deb3bb
--- /dev/null
+++ b/src/mozilla_sec_eia/library/models/__init__.py
@@ -0,0 +1,3 @@
+"""Implement top level framework and utilities for defining pudl models/pipelines."""
+
+from .pipelines import PUDL_PIPELINES, PudlPipelineConfig, pudl_pipeline
diff --git a/src/mozilla_sec_eia/library/models.py b/src/mozilla_sec_eia/library/models/pipelines.py
similarity index 69%
rename from src/mozilla_sec_eia/library/models.py
rename to src/mozilla_sec_eia/library/models/pipelines.py
index f506d76..48a0a8f 100644
--- a/src/mozilla_sec_eia/library/models.py
+++ b/src/mozilla_sec_eia/library/models/pipelines.py
@@ -18,11 +18,9 @@
 yaml configuration, but will only be used for a single run.
 """
 
-import importlib
 import logging
 
 import mlflow
-import yaml
 from dagster import (
     EnvVar,
     GraphDefinition,
@@ -36,30 +34,16 @@
     success_hook,
 )
 from mlflow.entities.run_status import RunStatus
+from pydantic import BaseModel
 
-from .experiment_tracking import (
+from ..experiment_tracking import (
     ExperimentTracker,
-    MlflowMetricsIOManager,
-    MlflowPandasArtifactIOManager,
     experiment_tracker_teardown_factory,
+    get_mlflow_io_manager,
 )
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
-MODEL_RESOURCES = {}
-PUDL_MODELS = {}
-
-
-def get_yml_config(experiment_name: str) -> dict:
-    """Load model configuration from yaml file."""
-    config_file = (
-        importlib.resources.files("pudl.package_data.settings") / "pudl_models.yml"
-    )
-    config = yaml.safe_load(config_file.open("r"))
-
-    if not (model_config := config.get(experiment_name)):
-        raise RuntimeError(f"No {experiment_name} entry in {config_file}")
-
-    return {experiment_name: model_config}
+PUDL_PIPELINES = {}
 
 
 def get_default_config(model_graph: GraphDefinition) -> dict:
@@ -86,7 +70,7 @@ def _get_default_from_ops(node: OpDefinition | GraphDefinition):
     return config
 
 
-def get_pudl_model_job_name(experiment_name: str) -> str:
+def get_pudl_pipeline_job_name(experiment_name: str) -> str:
     """Return expected pudl model job name based on experiment_name."""
     return f"{experiment_name}_job"
 
@@ -109,47 +93,51 @@ def _end_mlflow_run_with_failure(context: HookContext):
         mlflow.end_run(status=RunStatus.to_string(RunStatus.FAILED))
 
 
-def pudl_model(
-    experiment_name: str,
-    mlflow_pandas_io_manager_file_type: str = "parquet",
+class PudlPipelineConfig(BaseModel):
+    """Define the config format for a `pudl_pipeline`."""
+
+    experiment_name: str
+    op_config: dict = {}
+    required_mlflow_io_managers: list[str] = [
+        "mlflow_pandas_artifact_io_manager",
+        "previous_run_mlflow_pandas_artifact_io_manager",
+        "mlflow_metrics_io_manager",
+    ]
+    pandas_io_file_type: str = "parquet"
+
+
+def pudl_pipeline(
+    config: PudlPipelineConfig,
     resources: dict[str, ResourceDefinition] = {},
-    config_from_yaml: bool = False,
 ) -> JobDefinition:
     """Decorator for an ML model that will handle providing configuration to dagster."""
 
     def _decorator(model_graph: GraphDefinition):
-        model_config = get_default_config(model_graph)
-        if config_from_yaml:
-            model_config |= get_yml_config(model_graph.name)
+        model_config = get_default_config(model_graph) | config.op_config
 
         # Add resources to resource dict
         experiment_tracker = ExperimentTracker(
-            experiment_name=experiment_name,
+            experiment_name=config.experiment_name,
             tracking_uri=EnvVar("MLFLOW_TRACKING_URI"),
             project=EnvVar("GCS_PROJECT"),
         )
-        model_resources = {
-            "experiment_tracker": experiment_tracker,
-            "mlflow_pandas_artifact_io_manager": MlflowPandasArtifactIOManager(
-                file_type=mlflow_pandas_io_manager_file_type,
-                experiment_tracker=experiment_tracker,
-            ),
-            "previous_run_mlflow_pandas_artifact_io_manager": MlflowPandasArtifactIOManager(
-                use_previous_mlflow_run=True,
-                file_type=mlflow_pandas_io_manager_file_type,
-                experiment_tracker=experiment_tracker,
-            ),
-            "mlflow_metrics_io_manager": MlflowMetricsIOManager(
-                experiment_tracker=experiment_tracker,
-            ),
-        } | resources
+        model_resources = (
+            {"experiment_tracker": experiment_tracker}
+            | {
+                key: get_mlflow_io_manager(
+                    key, experiment_tracker, config.pandas_io_file_type
+                )
+                for key in config.required_mlflow_io_managers
+            }
+            | resources
+        )
 
         default_config = RunConfig(
             ops=model_config,
         )
 
         @job(
-            name=get_pudl_model_job_name(experiment_name),
+            name=get_pudl_pipeline_job_name(config.experiment_name),
             config=default_config,
             hooks={_log_config_hook, _end_mlflow_run_with_failure},
             resource_defs=model_resources,
@@ -163,7 +151,7 @@ def model_job(**kwargs):
             # Pass output to teardown to create a dependency
             tracker_teardown(graph_output)
 
-        PUDL_MODELS[get_pudl_model_job_name(experiment_name)] = model_job
+        PUDL_PIPELINES[get_pudl_pipeline_job_name(config.experiment_name)] = model_job
         return model_job
 
     return _decorator
diff --git a/src/mozilla_sec_eia/models/sec10k/extract.py b/src/mozilla_sec_eia/models/sec10k/extract.py
index 53e3b41..676ec38 100644
--- a/src/mozilla_sec_eia/models/sec10k/extract.py
+++ b/src/mozilla_sec_eia/models/sec10k/extract.py
@@ -20,7 +20,7 @@
     op,
 )
 
-from mozilla_sec_eia.library.models import pudl_model
+from mozilla_sec_eia.library.models import PudlPipelineConfig, pudl_pipeline
 
 from . import basic_10k
 from .utils.cloud import GCSArchive, cloud_interface_resource
@@ -246,51 +246,16 @@ def get_starting_data():
 basic_10k_extract_graph = extract_graph_factory("basic_10k", basic_10k.extract)
 
 
-@pudl_model(
-    "basic_10k_extraction", resources={"cloud_interface": cloud_interface_resource}
+basic_10k_extract_config = PudlPipelineConfig(
+    experiment_name="basic_10k_extraction",
+)
+
+
+@pudl_pipeline(
+    basic_10k_extract_config, resources={"cloud_interface": cloud_interface_resource}
 )
 @graph
 def basic_10k_extraction_model():
     """Implement basic 10k extraction pudl_model."""
     previous_extraction_metadata, previous_extracted = get_starting_data()
     return basic_10k_extract_graph(previous_extraction_metadata, previous_extracted)
-
-
-def compute_validation_metrics(
-    computed_set: pd.DataFrame,
-    validation_set: pd.DataFrame,
-    value_col: str,
-) -> dict:
-    """Compute precision and recall for extraction compared to validation set.
-
-    Arg:
-        computed_set: Extracted data.
-        validation_set: Expected extraction results.
-        value_col: Column to compare when computing metrics.
-    """
-    # Get initial length of both sets
-    computed_len = len(computed_set)
-    validation_len = len(validation_set)
-
-    # Get index of rows only in one set and make Null in other set
-    idx_validation_only = validation_set.index.difference(computed_set.index)
-    padded_compute_set = pd.concat(
-        [
-            computed_set[value_col],
-            pd.Series([None] * len(idx_validation_only), index=idx_validation_only),
-        ]
-    ).sort_index()
-    idx_compute_only = computed_set.index.difference(validation_set.index)
-    padded_validation_set = pd.concat(
-        [
-            validation_set[value_col],
-            pd.Series([None] * len(idx_compute_only), index=idx_compute_only),
-        ]
-    ).sort_index()
-
-    true_positives = (padded_compute_set == padded_validation_set).sum()
-
-    return {
-        "precision": true_positives / computed_len,
-        "recall": true_positives / validation_len,
-    }
diff --git a/src/mozilla_sec_eia/model_jobs.py b/src/mozilla_sec_eia/pudl_pipelines.py
similarity index 77%
rename from src/mozilla_sec_eia/model_jobs.py
rename to src/mozilla_sec_eia/pudl_pipelines.py
index 2a8918b..42cf090 100644
--- a/src/mozilla_sec_eia/model_jobs.py
+++ b/src/mozilla_sec_eia/pudl_pipelines.py
@@ -5,12 +5,12 @@
 import coloredlogs
 from dagster import Definitions
 
-from mozilla_sec_eia.library import get_ml_model_jobs
+from mozilla_sec_eia.library import get_ml_pipeline_jobs
 
 logger = logging.getLogger("catalystcoop")
 log_format = "%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s"
 coloredlogs.install(fmt=log_format, logger=logger)
 
 defs = Definitions(
-    jobs=get_ml_model_jobs(),
+    jobs=get_ml_pipeline_jobs(),
 )
diff --git a/tests/unit/models/sec10k/extract_test.py b/tests/unit/models/sec10k/extract_test.py
index 2b41936..7d7e716 100644
--- a/tests/unit/models/sec10k/extract_test.py
+++ b/tests/unit/models/sec10k/extract_test.py
@@ -11,7 +11,6 @@
 )
 from mozilla_sec_eia.models.sec10k.extract import (
     FilingsToExtractConfig,
-    compute_validation_metrics,
     extract_graph_factory,
 )
 from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive
@@ -134,76 +133,3 @@ def test_extract(
         filings_metadata
     )
     assert run.data.metrics == metrics
-
-
-@pytest.mark.parametrize(
-    "computed_set,validation_set,expected_precision,expected_recall",
-    [
-        (
-            pd.DataFrame({"value": ["a", "b", "c"]}, index=[0, 1, 2]),
-            pd.DataFrame({"value": ["a", "b", "c"]}, index=[0, 1, 2]),
-            1,
-            1,
-        ),
-        (
-            pd.DataFrame({"value": ["a", "b", "c", "d"]}, index=[0, 1, 2, 3]),
-            pd.DataFrame({"value": ["a", "b", "c"]}, index=[0, 1, 2]),
-            3 / 4,
-            1,
-        ),
-        (
-            pd.DataFrame({"value": ["a", "b", "c"]}, index=[0, 1, 2]),
-            pd.DataFrame({"value": ["a", "b", "c", "d"]}, index=[0, 1, 2, 3]),
-            1,
-            3 / 4,
-        ),
-        (
-            pd.DataFrame({"value": ["a", "b", "d"]}, index=[0, 1, 2]),
-            pd.DataFrame({"value": ["a", "b", "c"]}, index=[0, 1, 2]),
-            2 / 3,
-            2 / 3,
-        ),
-        (
-            pd.DataFrame(
-                {"value": ["a", "b", "d"], "idx0": ["1", "2", "3"], "idx1": [4, 2, 1]}
-            ).set_index(["idx0", "idx1"]),
-            pd.DataFrame(
-                {"value": ["a", "b", "c"], "idx0": ["1", "2", "3"], "idx1": [4, 2, 1]}
-            ).set_index(["idx0", "idx1"]),
-            2 / 3,
-            2 / 3,
-        ),
-        (
-            pd.DataFrame(
-                {
-                    "value": ["a", "b", "c", "d"],
-                    "idx0": ["1", "2", "3", "4"],
-                    "idx1": [4, 2, 1, 5],
-                }
-            ).set_index(["idx0", "idx1"]),
-            pd.DataFrame(
-                {"value": ["a", "b", "c"], "idx0": ["1", "2", "3"], "idx1": [4, 2, 1]}
-            ).set_index(["idx0", "idx1"]),
-            3 / 4,
-            1,
-        ),
-        (
-            pd.DataFrame(
-                {"value": ["c", "b", "a"], "idx0": ["3", "2", "1"], "idx1": [1, 2, 4]}
-            ).set_index(["idx0", "idx1"]),
-            pd.DataFrame(
-                {"value": ["a", "b", "c"], "idx0": ["1", "2", "3"], "idx1": [4, 2, 1]}
-            ).set_index(["idx0", "idx1"]),
-            1,
-            1,
-        ),
-    ],
-)
-def test_compute_validation_metrics(
-    computed_set, validation_set, expected_precision, expected_recall
-):
-    """Test validation metrics with test sets."""
-    metrics = compute_validation_metrics(computed_set, validation_set, "value")
-
-    assert metrics["precision"] == expected_precision
-    assert metrics["recall"] == expected_recall

From 4f50a7ba1f2dc69ebecfa416fb720a86607966ed Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Fri, 30 Aug 2024 10:02:15 -0400
Subject: [PATCH 016/161] Add validation pipeline

---
 .../experiment_tracking/mlflow_io_managers.py |  4 +-
 .../experiment_tracking/mlflow_resource.py    |  7 ++-
 .../library/experiment_tracking/validation.py | 57 ++++++++++++-------
 .../library/models/pipelines.py               |  7 ++-
 src/mozilla_sec_eia/models/sec10k/extract.py  | 53 ++++++++++++++---
 .../basic_10k_labels.csv                      |  0
 .../{ => validation_data}/ex21_labels.csv     |  0
 tests/unit/models/sec10k/extract_test.py      | 18 +-----
 8 files changed, 96 insertions(+), 50 deletions(-)
 rename src/mozilla_sec_eia/package_data/{ => validation_data}/basic_10k_labels.csv (100%)
 rename src/mozilla_sec_eia/package_data/{ => validation_data}/ex21_labels.csv (100%)

diff --git a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_io_managers.py b/src/mozilla_sec_eia/library/experiment_tracking/mlflow_io_managers.py
index bc42138..3ad69ab 100644
--- a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_io_managers.py
+++ b/src/mozilla_sec_eia/library/experiment_tracking/mlflow_io_managers.py
@@ -95,11 +95,11 @@ def load_input(self, context: InputContext) -> pd.DataFrame:
 
         if self.file_type == "csv":
             df = self._load_artifact_as_csv(
-                mlflow_run, artifact_name=f"{context.name}.csv"
+                mlflow_run, artifact_name=f"{context.upstream_output.name}.csv"
             )
         else:
             df = self._load_artifact_as_parquet(
-                mlflow_run, artifact_name=f"{context.name}.parquet"
+                mlflow_run, artifact_name=f"{context.upstream_output.name}.parquet"
             )
 
         return df
diff --git a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py b/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py
index 744c013..38e750f 100644
--- a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py
+++ b/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py
@@ -78,7 +78,9 @@ def yield_for_execution(
 
             if (active_run := mlflow.active_run()) is not None:
                 if active_run.info.run_id != mlflow_run_id:
-                    raise RuntimeError("Found conflicting active mlflow run!")
+                    raise RuntimeError(
+                        f"Found conflicting active mlflow run! - {active_run.info.run_id} != {mlflow_run_id}"
+                    )
                 yield self
             else:
                 # Create new run under specified experiment
@@ -154,9 +156,8 @@ def get_or_create_experiment(
 
 def experiment_tracker_teardown_factory(
     experiment_name: str,
-) -> ExperimentTracker:
+):
     """Use config to create an experiment tracker."""
-    atexit.unregister(mlflow.end_run)
 
     @op(
         name=f"{experiment_name}_tracker_teardown",
diff --git a/src/mozilla_sec_eia/library/experiment_tracking/validation.py b/src/mozilla_sec_eia/library/experiment_tracking/validation.py
index c8aebe1..bed3a17 100644
--- a/src/mozilla_sec_eia/library/experiment_tracking/validation.py
+++ b/src/mozilla_sec_eia/library/experiment_tracking/validation.py
@@ -1,13 +1,46 @@
 """Implement common utilities/functions for validating models."""
 
+from importlib import resources
+
 import pandas as pd
-from dagster import OpDefinition, Out, op
+from dagster import Config, Out, op
+
+
+class LoadValidationConfig(Config):
+    """Configuration for loading validation data."""
+
+    filename: str
+
+
+@op(
+    required_resource_keys=["experiment_tracker"],
+    out={"validation_set": Out(io_manager_key="mlflow_pandas_artifact_io_manager")},
+)
+def load_validation_data(config: LoadValidationConfig) -> pd.DataFrame:
+    """Load csv with validation data from `package_data` directory."""
+    return pd.read_csv(
+        resources.files("mozilla_sec_eia.package_data.validation_data")
+        / config.filename
+    )
+
 
+class PandasPrecisionRecallConfig(Config):
+    """Configuration for computing precision/recall from pandas dataframe."""
 
-def _pandas_compute_precision_recall(
+    value_col: str
+
+
+@op(
+    out={
+        "pandas_precision_recall_metrics": Out(
+            io_manager_key="mlflow_metrics_io_manager"
+        )
+    }
+)
+def pandas_compute_precision_recall(
+    config: PandasPrecisionRecallConfig,
     computed_set: pd.DataFrame,
     validation_set: pd.DataFrame,
-    value_col: str,
 ) -> dict:
     """Compute precision and recall for extraction compared to validation set.
 
@@ -24,14 +57,14 @@ def _pandas_compute_precision_recall(
     idx_validation_only = validation_set.index.difference(computed_set.index)
     padded_compute_set = pd.concat(
         [
-            computed_set[value_col],
+            computed_set[config.value_col],
             pd.Series([None] * len(idx_validation_only), index=idx_validation_only),
         ]
     ).sort_index()
     idx_compute_only = computed_set.index.difference(validation_set.index)
     padded_validation_set = pd.concat(
         [
-            validation_set[value_col],
+            validation_set[config.value_col],
             pd.Series([None] * len(idx_compute_only), index=idx_compute_only),
         ]
     ).sort_index()
@@ -42,17 +75,3 @@ def _pandas_compute_precision_recall(
         "precision": true_positives / computed_len,
         "recall": true_positives / validation_len,
     }
-
-
-def pandas_precision_recall_op_factory(value_col: str) -> OpDefinition:
-    """Return an op that will compute precision/recall on `value_col` of dataframe."""
-
-    @op(
-        out={
-            "precision_recall_metrics": Out(io_manager_key="mlflow_metrics_io_manager")
-        }
-    )
-    def _precision_recall_op(computed_set: pd.DataFrame, validation_set: pd.DataFrame):
-        return _pandas_compute_precision_recall(computed_set, validation_set, value_col)
-
-    return _precision_recall_op
diff --git a/src/mozilla_sec_eia/library/models/pipelines.py b/src/mozilla_sec_eia/library/models/pipelines.py
index 48a0a8f..cfecc0d 100644
--- a/src/mozilla_sec_eia/library/models/pipelines.py
+++ b/src/mozilla_sec_eia/library/models/pipelines.py
@@ -30,6 +30,7 @@
     ResourceDefinition,
     RunConfig,
     failure_hook,
+    graph,
     job,
     success_hook,
 )
@@ -112,8 +113,10 @@ def pudl_pipeline(
 ) -> JobDefinition:
     """Decorator for an ML model that will handle providing configuration to dagster."""
 
-    def _decorator(model_graph: GraphDefinition):
-        model_config = get_default_config(model_graph) | config.op_config
+    def _decorator(pipeline_func):
+        model_graph = graph(pipeline_func)
+        model_config = get_default_config(model_graph)
+        model_config[model_graph.name]["ops"] |= config.op_config
 
         # Add resources to resource dict
         experiment_tracker = ExperimentTracker(
diff --git a/src/mozilla_sec_eia/models/sec10k/extract.py b/src/mozilla_sec_eia/models/sec10k/extract.py
index 676ec38..21438c9 100644
--- a/src/mozilla_sec_eia/models/sec10k/extract.py
+++ b/src/mozilla_sec_eia/models/sec10k/extract.py
@@ -20,6 +20,7 @@
     op,
 )
 
+from mozilla_sec_eia.library.experiment_tracking import validation
 from mozilla_sec_eia.library.models import PudlPipelineConfig, pudl_pipeline
 
 from . import basic_10k
@@ -42,10 +43,10 @@ class ExtractionMetadataSchema(pa.DataFrameModel):
 
 @op
 def get_filing_metadata(
-    cloud_interface: GCSArchive,
+    cloud_interface: GCSArchive, filenames: list[str] | None = None
 ) -> pd.DataFrame:
     """Return filing metadata."""
-    return cloud_interface.get_metadata()
+    return cloud_interface.get_metadata(filenames=filenames)
 
 
 class ChunkFilingsConfig(Config):
@@ -150,8 +151,7 @@ def extract_graph_factory(
             "extraction_metrics": GraphOut(),
         },
     )
-    def extract_filings(previous_extraction_metadata, previous_extracted):
-        metadata = get_filing_metadata()
+    def extract_filings(metadata, previous_extraction_metadata, previous_extracted):
         filings_to_extract = get_filings_to_extract(
             metadata,
             previous_extraction_metadata,
@@ -254,8 +254,47 @@ def get_starting_data():
 @pudl_pipeline(
     basic_10k_extract_config, resources={"cloud_interface": cloud_interface_resource}
 )
-@graph
-def basic_10k_extraction_model():
+def basic_10k_extraction_pipeline():
     """Implement basic 10k extraction pudl_model."""
+    filing_metadata = get_filing_metadata()
     previous_extraction_metadata, previous_extracted = get_starting_data()
-    return basic_10k_extract_graph(previous_extraction_metadata, previous_extracted)
+    return basic_10k_extract_graph(
+        filing_metadata, previous_extraction_metadata, previous_extracted
+    )
+
+
+@op
+def get_validation_filenames(validation_set: pd.DataFrame) -> list[str]:
+    """Return filenames in validation set."""
+    return list(validation_set["filename"])
+
+
+basic_10k_extract_validation_config = PudlPipelineConfig(
+    experiment_name="basic_10k_extraction_validation",
+    pandas_io_file_type="csv",
+    op_config={
+        "load_validation_data": validation.LoadValidationConfig(
+            filename="basic_10k_labels.csv"
+        ),
+        "pandas_compute_precision_recall": validation.PandasPrecisionRecallConfig(
+            value_col="value"
+        ),
+    },
+)
+
+
+@pudl_pipeline(
+    basic_10k_extract_validation_config,
+    resources={"cloud_interface": cloud_interface_resource},
+)
+def basic_10k_extraction_validation_pipeline():
+    """Job to validate basic 10k extraction."""
+    validation_set = validation.load_validation_data()
+    filing_metadata = get_filing_metadata(
+        filenames=get_validation_filenames(validation_set)
+    )
+    empty_metadata, empty_extracted = get_starting_data()
+    _, extracted, _ = basic_10k_extract_graph(
+        filing_metadata, empty_metadata, empty_extracted
+    )
+    return validation.pandas_compute_precision_recall(extracted, validation_set)
diff --git a/src/mozilla_sec_eia/package_data/basic_10k_labels.csv b/src/mozilla_sec_eia/package_data/validation_data/basic_10k_labels.csv
similarity index 100%
rename from src/mozilla_sec_eia/package_data/basic_10k_labels.csv
rename to src/mozilla_sec_eia/package_data/validation_data/basic_10k_labels.csv
diff --git a/src/mozilla_sec_eia/package_data/ex21_labels.csv b/src/mozilla_sec_eia/package_data/validation_data/ex21_labels.csv
similarity index 100%
rename from src/mozilla_sec_eia/package_data/ex21_labels.csv
rename to src/mozilla_sec_eia/package_data/validation_data/ex21_labels.csv
diff --git a/tests/unit/models/sec10k/extract_test.py b/tests/unit/models/sec10k/extract_test.py
index 7d7e716..0a12a17 100644
--- a/tests/unit/models/sec10k/extract_test.py
+++ b/tests/unit/models/sec10k/extract_test.py
@@ -13,7 +13,6 @@
     FilingsToExtractConfig,
     extract_graph_factory,
 )
-from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
@@ -73,23 +72,8 @@ def test_sec10k_extract_pipeline(
 ):
     """Test high level extraction workflow."""
 
-    class FakeArchive(GCSArchive):
-        filings_bucket_name: str = ""
-        labels_bucket_name: str = ""
-        metadata_db_instance_connection: str = ""
-        user: str = ""
-        metadata_db_name: str = ""
-        project: str = ""
-
-        def setup_for_execution(self, context):
-            pass
-
-        def get_metadata(self):
-            return filings_metadata
-
     @op(out={"extraction_metadata": Out(), "extracted": Out()})
     def test_extract(
-        cloud_interface: GCSArchive,
         filings_to_extract: pd.DataFrame,
     ) -> tuple[pd.DataFrame, pd.DataFrame]:
         md = filings_to_extract
@@ -104,7 +88,6 @@ def test_extract(
     test_graph = extract_graph_factory("test_extract", test_extract)
     resources = {
         "experiment_tracker": test_tracker,
-        "cloud_interface": FakeArchive(),
         "mlflow_pandas_artifact_io_manager": MlflowPandasArtifactIOManager(
             experiment_tracker=test_tracker
         ),
@@ -118,6 +101,7 @@ def test_extract(
             {"get_filings_to_extract": FilingsToExtractConfig(num_filings=num_filings)}
         ),
         input_values={
+            "metadata": filings_metadata,
             "previous_extraction_metadata": previous_extraction_metadata,
             "previous_extracted": pd.DataFrame(),
         },

From f6ab22cfe2884e4899fd74a2ed8d8c138cb4211f Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Mon, 2 Sep 2024 13:00:45 -0400
Subject: [PATCH 017/161] Streamline construction of dagster jobs for
 running/testing pudl models

---
 src/mozilla_sec_eia/library/__init__.py       |   7 -
 .../library/experiment_tracking/validation.py |  77 -----
 .../__init__.py                               |  17 +-
 .../mlflow_io_managers.py                     |  35 +-
 .../mlflow_resource.py                        | 110 ++-----
 .../library/mlflow/validation.py              |  88 +++++
 .../library/models/__init__.py                |   3 -
 .../library/models/pipelines.py               | 160 ---------
 src/mozilla_sec_eia/library/pipeline.py       |  93 ++++++
 .../models/sec10k/basic_10k.py                | 167 +++++-----
 src/mozilla_sec_eia/models/sec10k/extract.py  | 304 +-----------------
 src/mozilla_sec_eia/models/sec10k/pipeline.py | 168 ++++++++++
 .../models/sec10k/utils/cloud.py              |  14 +-
 src/mozilla_sec_eia/pudl_pipelines.py         |  26 +-
 .../pudl_validation_pipelines.py              |  32 ++
 tests/conftest.py                             |   6 +-
 tests/unit/models/sec10k/extract_test.py      | 168 ++++------
 17 files changed, 615 insertions(+), 860 deletions(-)
 delete mode 100644 src/mozilla_sec_eia/library/experiment_tracking/validation.py
 rename src/mozilla_sec_eia/library/{experiment_tracking => mlflow}/__init__.py (58%)
 rename src/mozilla_sec_eia/library/{experiment_tracking => mlflow}/mlflow_io_managers.py (71%)
 rename src/mozilla_sec_eia/library/{experiment_tracking => mlflow}/mlflow_resource.py (53%)
 create mode 100644 src/mozilla_sec_eia/library/mlflow/validation.py
 delete mode 100644 src/mozilla_sec_eia/library/models/__init__.py
 delete mode 100644 src/mozilla_sec_eia/library/models/pipelines.py
 create mode 100644 src/mozilla_sec_eia/library/pipeline.py
 create mode 100644 src/mozilla_sec_eia/models/sec10k/pipeline.py
 create mode 100644 src/mozilla_sec_eia/pudl_validation_pipelines.py

diff --git a/src/mozilla_sec_eia/library/__init__.py b/src/mozilla_sec_eia/library/__init__.py
index 3bd8694..3fef8c0 100644
--- a/src/mozilla_sec_eia/library/__init__.py
+++ b/src/mozilla_sec_eia/library/__init__.py
@@ -1,8 +1 @@
 """Implements shared tooling for machine learning models in PUDL."""
-
-from . import models
-
-
-def get_ml_pipeline_jobs() -> list[str]:
-    """Return all jobs created through `pudl_model` decorator."""
-    return list(models.PUDL_PIPELINES.values())
diff --git a/src/mozilla_sec_eia/library/experiment_tracking/validation.py b/src/mozilla_sec_eia/library/experiment_tracking/validation.py
deleted file mode 100644
index bed3a17..0000000
--- a/src/mozilla_sec_eia/library/experiment_tracking/validation.py
+++ /dev/null
@@ -1,77 +0,0 @@
-"""Implement common utilities/functions for validating models."""
-
-from importlib import resources
-
-import pandas as pd
-from dagster import Config, Out, op
-
-
-class LoadValidationConfig(Config):
-    """Configuration for loading validation data."""
-
-    filename: str
-
-
-@op(
-    required_resource_keys=["experiment_tracker"],
-    out={"validation_set": Out(io_manager_key="mlflow_pandas_artifact_io_manager")},
-)
-def load_validation_data(config: LoadValidationConfig) -> pd.DataFrame:
-    """Load csv with validation data from `package_data` directory."""
-    return pd.read_csv(
-        resources.files("mozilla_sec_eia.package_data.validation_data")
-        / config.filename
-    )
-
-
-class PandasPrecisionRecallConfig(Config):
-    """Configuration for computing precision/recall from pandas dataframe."""
-
-    value_col: str
-
-
-@op(
-    out={
-        "pandas_precision_recall_metrics": Out(
-            io_manager_key="mlflow_metrics_io_manager"
-        )
-    }
-)
-def pandas_compute_precision_recall(
-    config: PandasPrecisionRecallConfig,
-    computed_set: pd.DataFrame,
-    validation_set: pd.DataFrame,
-) -> dict:
-    """Compute precision and recall for extraction compared to validation set.
-
-    Arg:
-        computed_set: Extracted data.
-        validation_set: Expected extraction results.
-        value_col: Column to compare when computing metrics.
-    """
-    # Get initial length of both sets
-    computed_len = len(computed_set)
-    validation_len = len(validation_set)
-
-    # Get index of rows only in one set and make Null in other set
-    idx_validation_only = validation_set.index.difference(computed_set.index)
-    padded_compute_set = pd.concat(
-        [
-            computed_set[config.value_col],
-            pd.Series([None] * len(idx_validation_only), index=idx_validation_only),
-        ]
-    ).sort_index()
-    idx_compute_only = computed_set.index.difference(validation_set.index)
-    padded_validation_set = pd.concat(
-        [
-            validation_set[config.value_col],
-            pd.Series([None] * len(idx_compute_only), index=idx_compute_only),
-        ]
-    ).sort_index()
-
-    true_positives = (padded_compute_set == padded_validation_set).sum()
-
-    return {
-        "precision": true_positives / computed_len,
-        "recall": true_positives / validation_len,
-    }
diff --git a/src/mozilla_sec_eia/library/experiment_tracking/__init__.py b/src/mozilla_sec_eia/library/mlflow/__init__.py
similarity index 58%
rename from src/mozilla_sec_eia/library/experiment_tracking/__init__.py
rename to src/mozilla_sec_eia/library/mlflow/__init__.py
index fe6a070..380a63c 100644
--- a/src/mozilla_sec_eia/library/experiment_tracking/__init__.py
+++ b/src/mozilla_sec_eia/library/mlflow/__init__.py
@@ -6,30 +6,25 @@
     MlflowPandasArtifactIOManager,
 )
 from .mlflow_resource import (
-    ExperimentTracker,
-    experiment_tracker_teardown_factory,
+    MlflowInterface,
     get_most_recent_run,
 )
 
 
 def get_mlflow_io_manager(
-    key: str, experiment_tracker: ExperimentTracker, pandas_file_type: str = "parquet"
+    key: str,
+    mlflow_interface: MlflowInterface | None = None,
+    pandas_file_type: str = "parquet",
 ) -> MlflowBaseIOManager:
     """Construct IO-manager based on key."""
     if key == "mlflow_pandas_artifact_io_manager":
         io_manager = MlflowPandasArtifactIOManager(
             file_type=pandas_file_type,
-            experiment_tracker=experiment_tracker,
-        )
-    elif key == "previous_run_mlflow_pandas_artifact_io_manager":
-        io_manager = MlflowPandasArtifactIOManager(
-            file_type=pandas_file_type,
-            experiment_tracker=experiment_tracker,
-            use_previous_mlflow_run=True,
+            mlflow_interface=mlflow_interface,
         )
     elif key == "mlflow_metrics_io_manager":
         io_manager = MlflowMetricsIOManager(
-            experiment_tracker=experiment_tracker,
+            mlflow_interface=mlflow_interface,
         )
     else:
         raise RuntimeError(f"MlFlow IO-manager, {key}, does not exist.")
diff --git a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_io_managers.py b/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py
similarity index 71%
rename from src/mozilla_sec_eia/library/experiment_tracking/mlflow_io_managers.py
rename to src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py
index 3ad69ab..e78f627 100644
--- a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_io_managers.py
+++ b/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py
@@ -11,7 +11,7 @@
 from dagster import ConfigurableIOManager, InputContext, OutputContext
 from mlflow.entities import Run
 
-from .mlflow_resource import ExperimentTracker
+from .mlflow_resource import MlflowInterface
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
@@ -19,28 +19,16 @@
 class MlflowBaseIOManager(ConfigurableIOManager):
     """Specify base config and implement helper functions for mlflow io-managers."""
 
-    experiment_tracker: ExperimentTracker
+    mlflow_interface: MlflowInterface
     #: By default handles artifacts from current run, but can be used with previous run.
-    use_previous_mlflow_run: bool = False
 
     def _get_run_info(self) -> Run:
-        """Use `dagster_run_id` and `use_previous_mlflow_run` to get run info from appropriate mlflow run."""
-        dagster_run_id = self.experiment_tracker.get_run_id()
-        filter_string = f"tags.dagster_run_id='{dagster_run_id}'"
-        if self.use_previous_mlflow_run:
-            filter_string = f"tags.dagster_run_id!='{dagster_run_id}'"
-
-        run_metadata = mlflow.search_runs(
-            experiment_names=[self.experiment_tracker.experiment_name],
-            filter_string=filter_string,
-        )
-
-        # Mlflow returns runs ordered by their runtime, so it's easy to grab the latest run
-        return mlflow.get_run(run_metadata.loc[0, "run_id"])
+        """Get mlflow `Run` object using current run id."""
+        return mlflow.get_run(self.mlflow_interface.mlflow_run_id)
 
 
 class MlflowPandasArtifactIOManager(MlflowBaseIOManager):
-    """Implement IO manager for logging/loading parquet files as mlflow artifacts."""
+    """Implement IO manager for logging/loading dataframes as mlflow artifacts."""
 
     file_type: Literal["parquet", "csv"] = "parquet"
 
@@ -79,11 +67,6 @@ def _get_dagster_run_id(self, context: InputContext | OutputContext) -> str:
 
     def handle_output(self, context: OutputContext, df: pd.DataFrame):
         """Attach dataframe to run as artifact."""
-        if self.use_previous_mlflow_run:
-            raise NotImplementedError(
-                "MlflowPandasArtifactIOManager can not be used to add artifacts to completed run."
-            )
-
         if self.file_type == "csv":
             self._log_artifact_as_csv(df, artifact_name=f"{context.name}.csv")
         else:
@@ -106,15 +89,13 @@ def load_input(self, context: InputContext) -> pd.DataFrame:
 
 
 class MlflowMetricsIOManager(MlflowBaseIOManager):
-    """Log/load models from mlflow tracking server."""
-
-    experiment_tracker: ExperimentTracker
+    """Log/load metrics from mlflow tracking server."""
 
     def handle_output(self, context: OutputContext, obj: dict[str, float]):
-        """Log metrics to mlflow run/experiment from `experiment_tracker`."""
+        """Log metrics to mlflow run/experiment created by `MlflowInterface`."""
         mlflow.log_metrics(obj)
 
     def load_input(self, context: OutputContext) -> dict[str, float]:
-        """Log metrics to mlflow run/experiment from `experiment_tracker`."""
+        """Log metrics to mlflow run/experiment created by `MlflowInterface`."""
         run = self._get_run_info()
         return run.data.metrics
diff --git a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py b/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py
similarity index 53%
rename from src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py
rename to src/mozilla_sec_eia/library/mlflow/mlflow_resource.py
index 38e750f..35dbf26 100644
--- a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py
+++ b/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py
@@ -9,87 +9,71 @@
 this is a configurable value, which can be found in the dagster UI.
 """
 
-import atexit
 import logging
 import os
 from contextlib import contextmanager
 
 import mlflow
-from dagster import ConfigurableResource, In, InitResourceContext, Nothing, op
+from dagster import ConfigurableResource, EnvVar, InitResourceContext
 from google.cloud import secretmanager
 from pydantic import PrivateAttr
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
 
-class ExperimentTracker(ConfigurableResource):
-    """Class to manage tracking a machine learning model using MLflow.
+class MlflowInterface(ConfigurableResource):
+    """Dagster resource to interface with mlflow tracking server.
 
-    The following command will launch the mlflow UI to view model results:
-    `mlflow ui --backend-store-uri {tracking_uri}`. From here, you can compare metrics
-    from multiple runs, and track performance.
+    This resource handles configuring mlflow to interface with a remote tracking server.
+    When `tracking_enabled` is set to True, this resource will also start an mlflow
+    run that can be used to log metrics/parameters/artifacts which will be associated
+    with a validation or training run. In most cases this resource does not need to
+    be referenced directly, and instead the io-managers defined in
+    :mod:`.mlflow_io_managers` should be used.
 
-    This class is designed to be created using the `op` :func:`create_experiment_tracker`.
-    This allows the `ExperimentTracker` to be passed around within a Dagster `graph`,
-    and be used for mlflow logging in any of the `op`'s that make up the `graph`. This
-    is useful because Dagster executes `op`'s in separate processes, while mlflow does
-    not maintain state between processes. This design also allows configuration of
-    the ExperimentTracker to be set from the Dagster UI.
-
-    Currently, we are only doing experiment tracking in a local context, but if we were
-    to setup a tracking server, we could point the `tracking_uri` at this remote server
-    without having to modify the models. Experiment tracking can also be done outside
-    of the PUDL context. If doing exploratory work in a notebook, you can use mlflow
-    directly in a notebook with the same experiment name used here, and mlflow will
-    seamlessly integrate the results with those from PUDL runs.
+    Note: `tracking_enabled` SHOULD NOT be set when using a dagster multi-process
+    executor. mlflow will create a new run for every process, which gets very messy.
     """
 
-    tracking_uri: str
+    tracking_uri: str = EnvVar("MLFLOW_TRACKING_URI")
     tracking_enabled: bool = True
     artifact_location: str | None = None
     experiment_name: str
     tags: dict = {}
-    project: str
+    project: str = EnvVar("GCS_PROJECT")
 
-    _run_id: str = PrivateAttr()
+    _mlflow_run_id: str = PrivateAttr()
 
     @contextmanager
     def yield_for_execution(
         self,
         context: InitResourceContext,
-    ) -> "ExperimentTracker":
+    ) -> "MlflowInterface":
         """Create experiment tracker for specified experiment."""
-        self._run_id = context.run_id
+        dagster_run_id = context.run_id
+        self._mlflow_run_id = None
+        self._configure_mlflow()
 
         if self.tracking_enabled:
-            self._configure_mlflow()
-
-            # Hack to stop mlflow from ending run at process barrier
-            # This is borrowed from the official dagster mlflow resource found here:
-            # https://github.com/dagster-io/dagster/blob/master/python_modules/libraries/dagster-mlflow/dagster_mlflow/resources.py
-            atexit.unregister(mlflow.end_run)
-
             # Get run_id associated with current dagster run
             experiment_id = self.get_or_create_experiment(
                 experiment_name=self.experiment_name,
                 artifact_location=self.artifact_location,
             )
-            mlflow_run_id = self._get_mlflow_run_id(context.run_id, experiment_id)
-
-            if (active_run := mlflow.active_run()) is not None:
-                if active_run.info.run_id != mlflow_run_id:
-                    raise RuntimeError(
-                        f"Found conflicting active mlflow run! - {active_run.info.run_id} != {mlflow_run_id}"
-                    )
+            # Create new run under specified experiment
+            with mlflow.start_run(
+                experiment_id=experiment_id,
+                tags=self.tags | {"dagster_run_id": dagster_run_id},
+            ) as run:
+                self._mlflow_run_id = run.info.run_id
                 yield self
-            else:
-                # Create new run under specified experiment
-                with mlflow.start_run(
-                    run_id=mlflow_run_id,
-                    experiment_id=experiment_id,
-                    tags=self.tags | {"dagster_run_id": context.run_id},
-                ):
-                    yield self
+        else:
+            yield self
+
+    @property
+    def mlflow_run_id(self) -> str | None:
+        """Return run id of current run."""
+        return self._mlflow_run_id
 
     def _get_tracking_password(self, version_id: str = "latest"):
         """Get tracking server password from gcloud secrets."""
@@ -115,22 +99,6 @@ def _configure_mlflow(self):
         os.environ["MLFLOW_HTTP_REQUEST_TIMEOUT"] = "900"
         os.environ["MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING"] = "true"
 
-    def _get_mlflow_run_id(self, dagster_run_id: str, experiment_id: str):
-        """Search for existing run tagged with dagster run id or start new run."""
-        run_df = mlflow.search_runs(
-            experiment_ids=[experiment_id],
-            filter_string=f"tags.dagster_run_id='{dagster_run_id}'",
-        )
-
-        run_id = None
-        if not run_df.empty:
-            run_id = run_df.loc[0, "run_id"]
-        return run_id
-
-    def get_run_id(self):
-        """Return current dagster run_id."""
-        return self._run_id
-
     @staticmethod
     def get_or_create_experiment(
         experiment_name: str, artifact_location: str = ""
@@ -154,22 +122,6 @@ def get_or_create_experiment(
         return experiment_id
 
 
-def experiment_tracker_teardown_factory(
-    experiment_name: str,
-):
-    """Use config to create an experiment tracker."""
-
-    @op(
-        name=f"{experiment_name}_tracker_teardown",
-        required_resource_keys=["experiment_tracker"],
-        ins={"model_done": In(Nothing)},
-    )
-    def teardown_experiment_tracker():
-        mlflow.end_run()
-
-    return teardown_experiment_tracker
-
-
 def get_most_recent_run(
     experiment_name: str, dagster_run_id: str
 ) -> mlflow.entities.Run:
diff --git a/src/mozilla_sec_eia/library/mlflow/validation.py b/src/mozilla_sec_eia/library/mlflow/validation.py
new file mode 100644
index 0000000..999cbdd
--- /dev/null
+++ b/src/mozilla_sec_eia/library/mlflow/validation.py
@@ -0,0 +1,88 @@
+"""Implement common utilities/functions for validating models."""
+
+from importlib import resources
+
+import pandas as pd
+from dagster import AssetIn, AssetsDefinition, asset
+
+
+def load_validation_data_asset_factory(
+    asset_name: str,
+    filename: str,
+    index_cols: str | list[str] | None = None,
+) -> AssetsDefinition:
+    """Construct asset for loading validation data from CSV in `package_data`."""
+
+    @asset(
+        name=asset_name,
+        io_manager_key="mlflow_pandas_artifact_io_manager",
+    )
+    def load_validation_data() -> pd.DataFrame:
+        """Load csv with validation data from `package_data` directory."""
+        df = pd.read_csv(
+            resources.files("mozilla_sec_eia.package_data.validation_data") / filename
+        )
+        if index_cols is not None:
+            df = df.set_index(index_cols)
+        return df
+
+    return load_validation_data
+
+
+def pandas_precision_recall_asset_factory(
+    validation_asset: str,
+    computed_asset: str,
+    value_col: str,
+) -> AssetsDefinition:
+    """Produce asset to compute precision and recall on pandas dataframe.
+
+    The returned asset will take upstream computed/validation assets and compute
+    precision/recall on `value_col`.
+
+    Args:
+        validation_asset: Upstream asset containing dataframe of validation set.
+        computed_asset: Upstream asset containing dataframe of computed data.
+        value_col: Column to compare when computing metrics.
+    """
+
+    @asset(
+        ins={
+            "computed_set": AssetIn(computed_asset),
+            "validation_set": AssetIn(validation_asset),
+        },
+        io_manager_key="mlflow_metrics_io_manager",
+    )
+    def pandas_compute_precision_recall(
+        computed_set: pd.DataFrame,
+        validation_set: pd.DataFrame,
+    ) -> dict:
+        """Asset which will return computed metrics from dataframes."""
+        # Get initial length of both sets
+        computed_len = len(computed_set)
+        validation_len = len(validation_set)
+
+        # Get index of rows only in one set and make Null in other set
+        idx_validation_only = validation_set.index.difference(computed_set.index)
+        padded_compute_set = pd.concat(
+            [
+                computed_set[value_col],
+                pd.Series([None] * len(idx_validation_only), index=idx_validation_only),
+            ]
+        ).sort_index()
+        idx_compute_only = computed_set.index.difference(validation_set.index)
+        padded_validation_set = pd.concat(
+            [
+                validation_set[value_col],
+                pd.Series([None] * len(idx_compute_only), index=idx_compute_only),
+            ]
+        ).sort_index()
+
+        true_positives = (padded_compute_set == padded_validation_set).sum()
+
+        return {
+            "precision": true_positives / computed_len,
+            "recall": true_positives / validation_len,
+        }
+
+    # Return new asset
+    return pandas_compute_precision_recall
diff --git a/src/mozilla_sec_eia/library/models/__init__.py b/src/mozilla_sec_eia/library/models/__init__.py
deleted file mode 100644
index 3deb3bb..0000000
--- a/src/mozilla_sec_eia/library/models/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-"""Implement top level framework and utilities for defining pudl models/pipelines."""
-
-from .pipelines import PUDL_PIPELINES, PudlPipelineConfig, pudl_pipeline
diff --git a/src/mozilla_sec_eia/library/models/pipelines.py b/src/mozilla_sec_eia/library/models/pipelines.py
deleted file mode 100644
index cfecc0d..0000000
--- a/src/mozilla_sec_eia/library/models/pipelines.py
+++ /dev/null
@@ -1,160 +0,0 @@
-"""Provides tooling for developing/tracking ml models within PUDL.
-
-The main interface from this module is the :func:`pudl_model` decorator, which
-is meant to be applied to a dagster `graph`. This decorator will handle finding all
-configuration for a model/passing configuration to dagster, creating an
-:class:`ExperimentTracker` for the model, and ultimately will return a `job`
-from the model.
-
-There are a few different ways to provide configuration for a PUDL model. First, configuration will come from default values for any dagster `Config`'s which are associated
-with `op`'s which make up the model `graph`. For more info on dagster configuration,
-see https://docs.dagster.io/concepts/configuration/config-schema. The next way to
-provide configuration is through the yaml file: `pudl.package_data.settings.pudl_models.yml`.
-Any configuration in this file should be follow dagster's config-schema formatting,
-see the `ferc_to_ferc` entry as an example. Configuration provided this way will
-override any default values. The final way to provide configuration is through the
-dagster UI. To provide configuration this way, click `Open Launchpad` in the UI, and
-values can be edited here. This configuration will override both default values and
-yaml configuration, but will only be used for a single run.
-"""
-
-import logging
-
-import mlflow
-from dagster import (
-    EnvVar,
-    GraphDefinition,
-    HookContext,
-    JobDefinition,
-    OpDefinition,
-    ResourceDefinition,
-    RunConfig,
-    failure_hook,
-    graph,
-    job,
-    success_hook,
-)
-from mlflow.entities.run_status import RunStatus
-from pydantic import BaseModel
-
-from ..experiment_tracking import (
-    ExperimentTracker,
-    experiment_tracker_teardown_factory,
-    get_mlflow_io_manager,
-)
-
-logger = logging.getLogger(f"catalystcoop.{__name__}")
-PUDL_PIPELINES = {}
-
-
-def get_default_config(model_graph: GraphDefinition) -> dict:
-    """Get default config values for model."""
-
-    def _get_default_from_ops(node: OpDefinition | GraphDefinition):
-        config = {}
-        if isinstance(node, GraphDefinition):
-            config = {
-                "ops": {
-                    child_node.name: _get_default_from_ops(child_node)
-                    for child_node in node.node_defs
-                }
-            }
-        else:
-            if node.config_schema.default_provided:
-                config = {"config": node.config_schema.default_value}
-            else:
-                config = {"config": None}
-
-        return config
-
-    config = {model_graph.name: _get_default_from_ops(model_graph)}
-    return config
-
-
-def get_pudl_pipeline_job_name(experiment_name: str) -> str:
-    """Return expected pudl model job name based on experiment_name."""
-    return f"{experiment_name}_job"
-
-
-@success_hook(required_resource_keys={"experiment_tracker"})
-def _log_config_hook(context: HookContext):
-    if (config := context.op_config) is not None:
-        mlflow.log_params(
-            {f"{context.op.name}.{param}": value for param, value in config.items()}
-        )
-
-
-@failure_hook(required_resource_keys={"experiment_tracker"})
-def _end_mlflow_run_with_failure(context: HookContext):
-    exception = context.op_exception
-
-    if isinstance(exception, KeyboardInterrupt):
-        mlflow.end_run(status=RunStatus.to_string(RunStatus.KILLED))
-    else:
-        mlflow.end_run(status=RunStatus.to_string(RunStatus.FAILED))
-
-
-class PudlPipelineConfig(BaseModel):
-    """Define a config format for `pudl_pipeline`'s."""
-
-    experiment_name: str
-    op_config: dict = {}
-    required_mlflow_io_managers: list[str] = [
-        "mlflow_pandas_artifact_io_manager",
-        "previous_run_mlflow_pandas_artifact_io_manager",
-        "mlflow_metrics_io_manager",
-    ]
-    pandas_io_file_type: str = "parquet"
-
-
-def pudl_pipeline(
-    config: PudlPipelineConfig,
-    resources: dict[str, ResourceDefinition] = {},
-) -> JobDefinition:
-    """Decorator for an ML model that will handle providing configuration to dagster."""
-
-    def _decorator(pipeline_func):
-        model_graph = graph(pipeline_func)
-        model_config = get_default_config(model_graph)
-        model_config[model_graph.name]["ops"] |= config.op_config
-
-        # Add resources to resource dict
-        experiment_tracker = ExperimentTracker(
-            experiment_name=config.experiment_name,
-            tracking_uri=EnvVar("MLFLOW_TRACKING_URI"),
-            project=EnvVar("GCS_PROJECT"),
-        )
-        model_resources = (
-            {"experiment_tracker": experiment_tracker}
-            | {
-                key: get_mlflow_io_manager(
-                    key, experiment_tracker, config.pandas_io_file_type
-                )
-                for key in config.required_mlflow_io_managers
-            }
-            | resources
-        )
-
-        default_config = RunConfig(
-            ops=model_config,
-        )
-
-        @job(
-            name=get_pudl_pipeline_job_name(config.experiment_name),
-            config=default_config,
-            hooks={_log_config_hook, _end_mlflow_run_with_failure},
-            resource_defs=model_resources,
-        )
-        def model_job(**kwargs):
-            tracker_teardown = experiment_tracker_teardown_factory(
-                experiment_name=model_graph.name,
-            )
-            graph_output = model_graph(**kwargs)
-
-            # Pass output to teardown to create a dependency
-            tracker_teardown(graph_output)
-
-        PUDL_PIPELINES[get_pudl_pipeline_job_name(config.experiment_name)] = model_job
-        return model_job
-
-    return _decorator
diff --git a/src/mozilla_sec_eia/library/pipeline.py b/src/mozilla_sec_eia/library/pipeline.py
new file mode 100644
index 0000000..e35f1a9
--- /dev/null
+++ b/src/mozilla_sec_eia/library/pipeline.py
@@ -0,0 +1,93 @@
+"""Implement helper methods for constructing dagster jobs.
+
+Methods defined here are the main interface for constructing PUDL model jobs.
+`create_production_pipeline` will produce a dagster job that will use the default
+multi-process executor to run a PUDL model. `create_validation_pipeline` is meant for
+testing/validating models with an mlflow run backing the dagster run for logging.
+"""
+
+import mlflow
+from dagster import (
+    AssetsDefinition,
+    HookContext,
+    ResourceDefinition,
+    define_asset_job,
+    failure_hook,
+    in_process_executor,
+    success_hook,
+)
+from mlflow.entities import RunStatus
+
+PUDL_PIPELINE_PRODUCTION_JOBS = []
+PUDL_PIPELINE_PRODUCTION_ASSETS = []
+PUDL_PIPELINE_PRODUCTION_RESOURCES = {}
+
+PUDL_PIPELINE_VALIDATION_JOBS = []
+PUDL_PIPELINE_VALIDATION_ASSETS = []
+PUDL_PIPELINE_VALIDATION_RESOURCES = {}
+
+
+def create_production_pipeline(
+    pipeline_name: str,
+    assets: list[AssetsDefinition],
+    resources: dict[str, ResourceDefinition],
+    **kwargs,
+):
+    """Construct a dagster job and supply Definitions with assets and resources."""
+    PUDL_PIPELINE_PRODUCTION_JOBS.append(
+        define_asset_job(
+            pipeline_name,
+            selection=assets,
+            **kwargs,
+        )
+    )
+    PUDL_PIPELINE_PRODUCTION_ASSETS.extend(assets)
+    PUDL_PIPELINE_PRODUCTION_RESOURCES.update(resources)
+
+
+@success_hook(required_resource_keys={"mlflow_interface"})
+def log_op_config(context: HookContext):
+    """Log any config supplied to ops/assets in validation job to mlflow tracking server."""
+    if context.op_config is not None:
+        mlflow.log_params(context.op_config)
+
+
+@failure_hook(required_resource_keys={"mlflow_interface"})
+def end_run_on_failure(context: HookContext):
+    """Inform mlflow about job failure."""
+    if isinstance(context.op_exception, KeyboardInterrupt):
+        mlflow.end_run(status=RunStatus.to_string(RunStatus.KILLED))
+    else:
+        mlflow.end_run(status=RunStatus.to_string(RunStatus.FAILED))
+
+
+def create_validation_pipeline(
+    pipeline_name: str,
+    assets: list[AssetsDefinition],
+    resources: dict[str, ResourceDefinition],
+    **kwargs,
+):
+    """Construct a dagster job and supply Definitions with assets and resources."""
+    PUDL_PIPELINE_VALIDATION_JOBS.append(
+        define_asset_job(
+            pipeline_name,
+            selection=assets,
+            executor_def=in_process_executor,
+            hooks={log_op_config, end_run_on_failure},
+            # Configure mlflow_interface for job with appropriate experiment name
+            config={
+                "ops": {},
+                "resources": {
+                    "mlflow_interface": {
+                        "config": {
+                            "experiment_name": pipeline_name,
+                            "tracking_enabled": True,
+                        }
+                    }
+                },
+            },
+            **kwargs,
+        )
+    )
+    PUDL_PIPELINE_VALIDATION_ASSETS.extend(assets)
+    PUDL_PIPELINE_VALIDATION_RESOURCES.update(resources)
diff --git a/src/mozilla_sec_eia/models/sec10k/basic_10k.py b/src/mozilla_sec_eia/models/sec10k/basic_10k.py
index e5b5f72..dfc2fc8 100644
--- a/src/mozilla_sec_eia/models/sec10k/basic_10k.py
+++ b/src/mozilla_sec_eia/models/sec10k/basic_10k.py
@@ -3,98 +3,97 @@
 import logging
 
 import pandas as pd
-from dagster import Out, op
 
-from .utils.cloud import GCSArchive, Sec10K
+from .extract import Sec10kExtractor
+from .utils.cloud import Sec10K
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
-EXPERIMENT_NAME = "basic_10k_extraction"
 
 
-def _extract_10k(filing: Sec10K):
-    """Extract basic company data from filing."""
-    logger.info(f"Extracting 10K company data from filing: {filing.filename}")
-    header = True
-    current_block = None
-    values = []
-    filer_count = 0
-    block_counts = {
-        "company data": 0,
-        "filing values": 0,
-        "business address": 0,
-        "mail address": 0,
-        "former company": 0,
-    }
-    unmatched_keys = []
-    for line in filing.filing_text.splitlines():
-        match line.replace("\t", "").lower().split(":"):
-            case ["filer", ""]:
-                filer_count += 1
-                header = False
-            case [
-                (
-                    "company data"
-                    | "filing values"
-                    | "business address"
-                    | "mail address"
-                    | "former company"
-                ) as block,
-                "",
-            ] if not header:
-                current_block = block
-                block_counts[current_block] += 1
-            case [key, ""] if current_block is not None:
-                key = f"{block}_{key}".replace(" ", "_")
-                logger.warning(f"No value found for {key} for filing {filing.filename}")
-                unmatched_keys.append(key)
-            case [key, value] if current_block is not None:
-                key = key.replace(" ", "_")
-                values.append(
-                    {
-                        "filename": filing.filename,
-                        "filer_count": filer_count - 1,
-                        "block": current_block.replace(" ", "_"),
-                        "block_count": block_counts[current_block] - 1,
-                        "key": key.replace(" ", "_"),
-                        "value": value,
-                    }
-                )
-            case ["</sec-header>" | "</ims-header>"]:
-                break
-            case _ if header:
-                continue
+class Basic10kExtractor(Sec10kExtractor):
+    """Implement Sec10kExtractor for basic 10k company info data."""
 
-    return pd.DataFrame(values), filing.filename, unmatched_keys
+    def _extract_10k(self, filing: Sec10K):
+        """Extract basic company data from filing."""
+        logger.info(f"Extracting 10K company data from filing: {filing.filename}")
+        header = True
+        current_block = None
+        values = []
+        filer_count = 0
+        block_counts = {
+            "company data": 0,
+            "filing values": 0,
+            "business address": 0,
+            "mail address": 0,
+            "former company": 0,
+        }
+        unmatched_keys = []
+        for line in filing.filing_text.splitlines():
+            match line.replace("\t", "").lower().split(":"):
+                case ["filer", ""]:
+                    filer_count += 1
+                    header = False
+                case [
+                    (
+                        "company data"
+                        | "filing values"
+                        | "business address"
+                        | "mail address"
+                        | "former company"
+                    ) as block,
+                    "",
+                ] if not header:
+                    current_block = block
+                    block_counts[current_block] += 1
+                case [key, ""] if current_block is not None:
+                    key = f"{block}_{key}".replace(" ", "_")
+                    logger.warning(
+                        f"No value found for {key} for filing {filing.filename}"
+                    )
+                    unmatched_keys.append(key)
+                case [key, value] if current_block is not None:
+                    key = key.replace(" ", "_")
+                    values.append(
+                        {
+                            "filename": filing.filename,
+                            "filer_count": filer_count - 1,
+                            "block": current_block.replace(" ", "_"),
+                            "block_count": block_counts[current_block] - 1,
+                            "key": key.replace(" ", "_"),
+                            "value": value,
+                        }
+                    )
+                case ["</sec-header>" | "</ims-header>"]:
+                    break
+                case _ if header:
+                    continue
 
+        return pd.DataFrame(values), filing.filename, unmatched_keys
 
-@op(out={"extraction_metadata": Out(), "extracted": Out()})
-def extract(
-    cloud_interface: GCSArchive,
-    filings_to_extract: pd.DataFrame,
-) -> tuple[pd.DataFrame, pd.DataFrame]:
-    """Extract basic 10K data and write to postgres table.
+    def extract_filings(
+        self,
+        filings_to_extract: pd.DataFrame,
+    ) -> tuple[pd.DataFrame, pd.DataFrame]:
+        """Extract basic 10K data and return extracted data/metadata."""
+        logger.info("Starting basic 10K extraction.")
+        logger.info(f"Extracting {len(filings_to_extract)} filings.")
 
-    Args:
-        continue_run: If true, only extract filings not in DB, otherwise clobber
-            basic_10k table.
-    """
-    logger.info("Starting basic 10K extraction.")
-    logger.info(f"Extracting {len(filings_to_extract)} filings.")
+        extraction_metadata = pd.DataFrame(
+            {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)}
+        ).set_index("filename")
+        extracted = pd.DataFrame()
 
-    extraction_metadata = pd.DataFrame(
-        {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)}
-    ).set_index("filename")
-    extracted = pd.DataFrame()
+        for filing in self.cloud_interface.iterate_filings(filings_to_extract):
+            ext, filename, unmatched_keys = self._extract_10k(filing)
+            extraction_metadata.loc[filename, ["success", "unmatched_keys"]] = [
+                len(ext) > 0,
+                ",".join(unmatched_keys),
+            ]
+            extracted = pd.concat([extracted, ext])
 
-    for filing in cloud_interface.iterate_filings(filings_to_extract):
-        ext, filename, unmatched_keys = _extract_10k(filing)
-        extraction_metadata.loc[filename, ["success", "unmatched_keys"]] = [
-            len(ext) > 0,
-            ",".join(unmatched_keys),
-        ]
-        extracted = pd.concat([extracted, ext])
-
-    return (
-        extraction_metadata,
-        extracted.set_index(["filename", "filer_count", "block", "block_count", "key"]),
-    )
+        return (
+            extraction_metadata,
+            extracted.set_index(
+                ["filename", "filer_count", "block", "block_count", "key"]
+            ),
+        )
diff --git a/src/mozilla_sec_eia/models/sec10k/extract.py b/src/mozilla_sec_eia/models/sec10k/extract.py
index 21438c9..ad3760d 100644
--- a/src/mozilla_sec_eia/models/sec10k/extract.py
+++ b/src/mozilla_sec_eia/models/sec10k/extract.py
@@ -1,300 +1,20 @@
-"""Implement top level extraction methods and tooling."""
+"""Implement base class for an SEC10k extractor."""
 
-import logging
-import math
-
-import numpy as np
 import pandas as pd
-import pandera as pa
-from dagster import (
-    Config,
-    DynamicOut,
-    DynamicOutput,
-    GraphDefinition,
-    GraphOut,
-    In,
-    OpDefinition,
-    Out,
-    Output,
-    graph,
-    op,
-)
-
-from mozilla_sec_eia.library.experiment_tracking import validation
-from mozilla_sec_eia.library.models import PudlPipelineConfig, pudl_pipeline
-
-from . import basic_10k
-from .utils.cloud import GCSArchive, cloud_interface_resource
-
-logger = logging.getLogger(f"catalystcoop.{__name__}")
-
-DATASETS = ["ex21", "basic_10k"]
-
-
-class ExtractionMetadataSchema(pa.DataFrameModel):
-    """Define the required schema for extraction metadata.
-
-    Extra columns are permitted, but these are required for computing extraction metrics.
-    """
+from dagster import ConfigurableResource
 
-    filename: pa.typing.Index[str] = pa.Field(check_name=True)
-    success: bool = pa.Field(coerce=True)
-
-
-@op
-def get_filing_metadata(
-    cloud_interface: GCSArchive, filenames: list[str] | None = None
-) -> pd.DataFrame:
-    """Return filing metadata."""
-    return cloud_interface.get_metadata(filenames=filenames)
-
-
-class ChunkFilingsConfig(Config):
-    """Config how many filings are extracted and chunk_size for extraction."""
-
-    chunk_size: int = 1000
-
-
-@op(out=DynamicOut())
-def chunk_filings(
-    config: ChunkFilingsConfig,
-    filings_to_extract: pd.DataFrame,
-) -> pd.DataFrame:
-    """Split filings into chunks for parallel extraction."""
-    for i, chunk in enumerate(
-        np.array_split(
-            filings_to_extract, math.ceil(len(filings_to_extract) / config.chunk_size)
-        )
-    ):
-        yield DynamicOutput(chunk, mapping_key=str(i))
+from .utils.cloud import GCSArchive
 
 
-class GetMostRecentRunResultsConfig(Config):
-    """Configuration specifying whether to get run results and continue."""
+class Sec10kExtractor(ConfigurableResource):
+    """Base class for extracting SEC 10k data."""
 
-    continue_run: bool = False
-
-
-@op(out={"basic_extraction_metrics": Out(io_manager_key="mlflow_metrics_io_manager")})
-def log_extraction_data(
-    metadata: pd.DataFrame,
-    extraction_metadata: pd.DataFrame,
-    extracted: pd.DataFrame,
-):
-    """Log results from extraction run."""
-    return {
-        "num_failed": (~extraction_metadata["success"]).sum(),
-        "ratio_extracted": len(extraction_metadata) / len(metadata),
-    }
-
-
-@op(
-    required_resource_keys=["experiment_tracker"],
-    out={
-        "extraction_metadata": Out(io_manager_key="mlflow_pandas_artifact_io_manager"),
-        "extracted": Out(io_manager_key="mlflow_pandas_artifact_io_manager"),
-    },
-)
-def merge_extracted_data(
-    extraction_metadata: list[pd.DataFrame],
-    extracted: list[pd.DataFrame],
-    previous_run_extraction_metadata: pd.DataFrame,
-    previous_run_extracted_data: pd.DataFrame,
-):
-    """Data is extracted in parallel ops, merge these plus any data from previous run."""
-    extraction_metadata = pd.concat(
-        extraction_metadata + [previous_run_extraction_metadata]
-    )
-    extracted = pd.concat(extracted + [previous_run_extracted_data])
-    # Use metadata to log generic metrics
-    extraction_metadata = ExtractionMetadataSchema.validate(extraction_metadata)
-
-    return extraction_metadata, extracted
-
-
-class FilingsToExtractConfig(Config):
-    """Define configuration for filtering filings to extract."""
-
-    num_filings: int = -1
-
-
-@op
-def get_filings_to_extract(
-    config: FilingsToExtractConfig,
-    filing_metadata: pd.DataFrame,
-    previous_extraction_metadata: pd.DataFrame,
-    previous_extracted: pd.DataFrame,
-):
-    """Filter out any previously extracted filings and sub-sample to `num_filings`."""
-    filings_to_extract = filing_metadata
-    if config.num_filings > 0:
-        filings_to_extract = filings_to_extract.sample(config.num_filings)
-
-    filings_to_extract = filings_to_extract[
-        ~filings_to_extract["filename"].isin(previous_extraction_metadata.index)
-    ]
-    return filings_to_extract
-
-
-def extract_graph_factory(
-    dataset_name: str,
-    extract_op: OpDefinition | GraphDefinition,
-):
-    """Produce a `pudl_model` to extract data from sec10k filings."""
-    experiment_name = f"{dataset_name}_extraction"
-
-    @graph(
-        name=experiment_name,
-        out={
-            "extraction_metadata": GraphOut(),
-            "extracted": GraphOut(),
-            "extraction_metrics": GraphOut(),
-        },
-    )
-    def extract_filings(metadata, previous_extraction_metadata, previous_extracted):
-        filings_to_extract = get_filings_to_extract(
-            metadata,
-            previous_extraction_metadata,
-            previous_extracted,
-        )
+    cloud_interface: GCSArchive
 
-        filing_chunks = chunk_filings(filings_to_extract)
-        extraction_metadata, extracted = filing_chunks.map(extract_op)
-        extraction_metadata, extracted = merge_extracted_data(
-            extraction_metadata.collect(),
-            extracted.collect(),
-            previous_extraction_metadata,
-            previous_extracted,
+    def extract_filings(
+        self, filing_metadata: pd.DataFrame
+    ) -> tuple[pd.DataFrame, pd.DataFrame]:
+        """Method must be implemented by subclasses to extract SEC10k filings."""
+        raise NotImplementedError(
+            "extract_filings must be implemented by any subclass!"
         )
-
-        extraction_metrics = log_extraction_data(
-            metadata,
-            extraction_metadata,
-            extracted,
-        )
-        return extraction_metadata, extracted, extraction_metrics
-
-    return extract_filings
-
-
-@op(
-    ins={
-        "extraction_metadata": In(
-            input_manager_key="previous_run_mlflow_pandas_artifact_io_manager"
-        ),
-        "extracted": In(
-            input_manager_key="previous_run_mlflow_pandas_artifact_io_manager"
-        ),
-    }
-)
-def get_previous_run_data(
-    continue_previous_run, extraction_metadata: pd.DataFrame, extracted: pd.DataFrame
-) -> tuple[pd.DataFrame, pd.DataFrame]:
-    """Return previous run data loaded by io-manager."""
-    extraction_metadata = ExtractionMetadataSchema.validate(extraction_metadata)
-
-    return extraction_metadata, extracted
-
-
-@op
-def get_empty_run_data(start_new_run):
-    """Return empty dataframes representing run metadata and extracted data."""
-    extraction_metadata = pd.DataFrame(
-        {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)}
-    ).set_index("filename")
-    extracted = pd.DataFrame()
-
-    return extraction_metadata, extracted
-
-
-class ContinuePreviousRunConfig(Config):
-    """Configure whether to continue a previous extraction run or not."""
-
-    continue_run: bool = False
-
-
-@op(out={"continue_run": Out(is_required=False), "new_run": Out(is_required=False)})
-def continue_previous_run(config: ContinuePreviousRunConfig):
-    """Create branch dictating whether a previous extraction run is continued or not."""
-    if config.continue_run:
-        yield Output(True, "continue_run")
-    else:
-        yield Output(True, "new_run")
-
-
-@op(out={"previous_run_extraction_metadata": Out(), "previous_extracted": Out()})
-def merge_branches(dfs: list[tuple[pd.DataFrame, pd.DataFrame]]):
-    """Merge branches created by `continue_previous_run` and return."""
-    dfs = dfs[0]
-    return dfs[0], dfs[1]
-
-
-@graph(
-    out={
-        "previous_run_extraction_metadata": GraphOut(),
-        "previous_extracted": GraphOut(),
-    }
-)
-def get_starting_data():
-    """Get previous run data if configured to do so."""
-    continue_run, new_run = continue_previous_run()
-    previous_data = get_previous_run_data(continue_run)
-    new_data = get_empty_run_data(new_run)
-    return merge_branches([previous_data, new_data])
-
-
-basic_10k_extract_graph = extract_graph_factory("basic_10k", basic_10k.extract)
-
-
-basic_10k_extract_config = PudlPipelineConfig(
-    experiment_name="basic_10k_extraction",
-)
-
-
-@pudl_pipeline(
-    basic_10k_extract_config, resources={"cloud_interface": cloud_interface_resource}
-)
-def basic_10k_extraction_pipeline():
-    """Implement basic 10k extraction pudl_model."""
-    filing_metadata = get_filing_metadata()
-    previous_extraction_metadata, previous_extracted = get_starting_data()
-    return basic_10k_extract_graph(
-        filing_metadata, previous_extraction_metadata, previous_extracted
-    )
-
-
-@op
-def get_validation_filenames(validation_set: pd.DataFrame) -> list[str]:
-    """Return filenames in validation set."""
-    return list(validation_set["filename"])
-
-
-basic_10k_extract_validation_config = PudlPipelineConfig(
-    experiment_name="basic_10k_extraction_validation",
-    pandas_io_file_type="csv",
-    op_config={
-        "load_validation_data": validation.LoadValidationConfig(
-            filename="basic_10k_labels.csv"
-        ),
-        "pandas_compute_precision_recall": validation.PandasPrecisionRecallConfig(
-            value_col="value"
-        ),
-    },
-)
-
-
-@pudl_pipeline(
-    basic_10k_extract_validation_config,
-    resources={"cloud_interface": cloud_interface_resource},
-)
-def basic_10k_extraction_validation_pipeline():
-    """Job to validate basic 10k extraction."""
-    validation_set = validation.load_validation_data()
-    filing_metadata = get_filing_metadata(
-        filenames=get_validation_filenames(validation_set)
-    )
-    empty_metadata, empty_extracted = get_starting_data()
-    _, extracted, _ = basic_10k_extract_graph(
-        filing_metadata, empty_metadata, empty_extracted
-    )
-    return validation.pandas_compute_precision_recall(extracted, validation_set)
diff --git a/src/mozilla_sec_eia/models/sec10k/pipeline.py b/src/mozilla_sec_eia/models/sec10k/pipeline.py
new file mode 100644
index 0000000..e4b7d0d
--- /dev/null
+++ b/src/mozilla_sec_eia/models/sec10k/pipeline.py
@@ -0,0 +1,168 @@
+"""Implement top level extraction methods and tooling."""
+
+import logging
+
+import pandas as pd
+import pandera as pa
+from dagster import (
+    AssetExecutionContext,
+    AssetIn,
+    AssetOut,
+    StaticPartitionsDefinition,
+    asset,
+    multi_asset,
+    with_resources,
+)
+
+from mozilla_sec_eia.library.mlflow import validation
+from mozilla_sec_eia.library.pipeline import (
+    create_production_pipeline,
+    create_validation_pipeline,
+)
+
+from .basic_10k import Basic10kExtractor
+from .extract import Sec10kExtractor
+from .utils.cloud import GCSArchive, cloud_interface_resource
+
+logger = logging.getLogger(f"catalystcoop.{__name__}")
+
+DATASETS = ["ex21", "basic_10k"]
+
+
+class ExtractionMetadataSchema(pa.DataFrameModel):
+    """Define the required schema for extraction metadata.
+
+    Extra columns are permitted, but these are required for computing extraction metrics.
+    """
+
+    filename: pa.typing.Index[str] = pa.Field(check_name=True)
+    success: bool = pa.Field(coerce=True)
+
+
+# Create year_quarter partitions
+partitions_def = StaticPartitionsDefinition(
+    [f"{year}q{quarter}" for year in range(1994, 2024) for quarter in range(1, 5)]
+)
+
+
+def sec10k_extraction_asset_factory(
+    name: str,
+    sec10k_extractor: Sec10kExtractor,
+    partitions_def=None,
+    filing_metadata_asset_name: str = "sec10k_filing_metadata",
+    extraction_metadata_asset_name: str = "extraction_metadata",
+    extracted_asset_name: str = "extracted",
+):
+    """Create asset to extract data from sec10k data.
+
+    Args:
+        name: Name of extraction asset.
+        sec10k_extractor: Subclass of Sec10kExtractor used to extract data.
+        partitions_def: Partitions for asset (production uses year_quarter parts,
+            validation is not partitioned).
+        filing_metadata_asset_name: Name of input asset with metadata of filings to
+            extract.
+        extraction_metadata_asset_name: Name of output asset containing metadata
+            from extraction run.
+        extracted_asset_name: Name of output asset containing extracted data.
+    """
+    # `outs` is keyed by asset name, so the two output names must be distinct.
+    @multi_asset(
+        name=name,
+        outs={
+            extraction_metadata_asset_name: AssetOut(),
+            extracted_asset_name: AssetOut(),
+        },
+        ins={"sec10k_filing_metadata": AssetIn(filing_metadata_asset_name)},
+        partitions_def=partitions_def,
+    )
+    def extract_filings(
+        sec10k_extractor: Sec10kExtractor, sec10k_filing_metadata: pd.DataFrame
+    ) -> tuple[pd.DataFrame, pd.DataFrame]:
+        """Run Sec10kExtractor on selected partition and return."""
+        extraction_metadata, extracted = sec10k_extractor.extract_filings(
+            sec10k_filing_metadata
+        )
+        return extraction_metadata, extracted
+
+    return with_resources([extract_filings], {"sec10k_extractor": sec10k_extractor})[0]
+
+
+@asset(partitions_def=partitions_def)
+def sec10k_filing_metadata(
+    context: AssetExecutionContext,
+    cloud_interface: GCSArchive,
+) -> pd.DataFrame:
+    """Return filing metadata for year_quarter partition."""
+    year_quarter = context.partition_key
+    df = cloud_interface.get_metadata(year_quarter=year_quarter)
+    return df
+
+
+# Create asset to load basic 10k validation data
+basic_10k_validation_set = validation.load_validation_data_asset_factory(
+    "basic_10k_validation_set",
+    "basic_10k_labels.csv",
+    index_cols=["filename", "filer_count", "block", "block_count", "key"],
+)
+
+
+# Create asset to compute precision/recall on basic 10k extraction of validation set
+basic_10k_extracted_validation_asset_name = "basic_10k_company_info_validation"
+basic_10k_extraction_validation_metrics = (
+    validation.pandas_precision_recall_asset_factory(
+        validation_asset="basic_10k_validation_set",
+        computed_asset=basic_10k_extracted_validation_asset_name,
+        value_col="value",
+    )
+)
+
+
+@asset(name="sec10k_filing_metadata_validation")
+def basic_10k_validation_filing_metadata(
+    cloud_interface: GCSArchive,
+    basic_10k_validation_set: pd.DataFrame,
+) -> pd.DataFrame:
+    """Get sec 10k filing metadata from validation set."""
+    filing_metadata = cloud_interface.get_metadata()
+    return filing_metadata[
+        filing_metadata["filename"].isin(
+            basic_10k_validation_set.index.get_level_values("filename").unique()
+        )
+    ]
+
+
+# Register basic 10k extraction pipeline
+create_production_pipeline(
+    "basic_10k_extraction",
+    [
+        sec10k_filing_metadata,
+        sec10k_extraction_asset_factory(
+            "basic_10k",
+            Basic10kExtractor(cloud_interface=cloud_interface_resource),
+            partitions_def=partitions_def,
+            extraction_metadata_asset_name="basic_10k_extraction_metadata",
+            extracted_asset_name="basic_10k_company_info",
+        ),
+    ],
+    resources={"cloud_interface": cloud_interface_resource},
+)
+
+
+# Register basic 10k extraction validation pipeline
+create_validation_pipeline(
+    "basic_10k_extraction",
+    [
+        basic_10k_validation_filing_metadata,
+        sec10k_extraction_asset_factory(
+            "basic_10k",
+            Basic10kExtractor(cloud_interface=cloud_interface_resource),
+            filing_metadata_asset_name="sec10k_filing_metadata_validation",
+            extraction_metadata_asset_name="basic_10k_extraction_validation_metadata",
+            extracted_asset_name=basic_10k_extracted_validation_asset_name,
+        ),
+        basic_10k_validation_set,
+        basic_10k_extraction_validation_metrics,
+    ],
+    resources={"cloud_interface": cloud_interface_resource},
+)
diff --git a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py
index 1232f45..81c7a35 100644
--- a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py
+++ b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py
@@ -170,7 +170,6 @@ class GCSArchive(ConfigurableResource):
     _filings_bucket = PrivateAttr()
     _labels_bucket = PrivateAttr()
     _engine = PrivateAttr()
-    _metadata_df = PrivateAttr(default=None)
 
     def setup_for_execution(self, context):
         """Initialize interface to filings archive on GCS."""
@@ -214,16 +213,13 @@ def create_session(self) -> Session:
         with Session(self._engine) as session:
             yield session
 
-    def get_metadata(self, filenames: list[str] | None = None) -> pd:
+    def get_metadata(self, year_quarter: str | None = None) -> pd.DataFrame:
         """Return dataframe of filing metadata."""
-        if self._metadata_df is None:
-            selection = select(Sec10kMetadata)
-            if filenames is not None:
-                selection = selection.where(Sec10kMetadata.filename.in_(filenames))
+        selection = select(Sec10kMetadata)
+        if year_quarter is not None:
+            selection = selection.where(Sec10kMetadata.year_quarter == year_quarter)
 
-            self._metadata_df = pd.read_sql(selection, self._engine)
-
-        return self._metadata_df
+        return pd.read_sql(selection, self._engine)
 
     def get_filing_blob(self, year_quarter: str, path: str) -> storage.Blob:
         """Return Blob pointing to file in GCS bucket."""
diff --git a/src/mozilla_sec_eia/pudl_pipelines.py b/src/mozilla_sec_eia/pudl_pipelines.py
index 42cf090..3a427c4 100644
--- a/src/mozilla_sec_eia/pudl_pipelines.py
+++ b/src/mozilla_sec_eia/pudl_pipelines.py
@@ -1,16 +1,32 @@
-"""Define asset jobs and configuration."""
+"""Define production pipelines for running PUDL models."""
 
 import logging
 
 import coloredlogs
-from dagster import Definitions
+from dagster import Definitions, EnvVar
 
-from mozilla_sec_eia.library import get_ml_pipeline_jobs
+from mozilla_sec_eia.library.mlflow import MlflowInterface
+from mozilla_sec_eia.library.pipeline import (
+    PUDL_PIPELINE_PRODUCTION_ASSETS,
+    PUDL_PIPELINE_PRODUCTION_JOBS,
+    PUDL_PIPELINE_PRODUCTION_RESOURCES,
+)
 
 logger = logging.getLogger("catalystcoop")
 log_format = "%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s"
 coloredlogs.install(fmt=log_format, logger=logger)
 
-defs = Definitions(
-    jobs=get_ml_pipeline_jobs(),
+
+mlflow_interface = MlflowInterface(
+    experiment_name="",
+    tracking_uri=EnvVar("MLFLOW_TRACKING_URI"),
+    project=EnvVar("GCS_PROJECT"),
+)
+
+production_io_resources = {} | PUDL_PIPELINE_PRODUCTION_RESOURCES
+
+production_pipelines = Definitions(
+    assets=PUDL_PIPELINE_PRODUCTION_ASSETS,
+    jobs=PUDL_PIPELINE_PRODUCTION_JOBS,
+    resources=production_io_resources | {"mlflow_interface": mlflow_interface},
 )
diff --git a/src/mozilla_sec_eia/pudl_validation_pipelines.py b/src/mozilla_sec_eia/pudl_validation_pipelines.py
new file mode 100644
index 0000000..44e7a55
--- /dev/null
+++ b/src/mozilla_sec_eia/pudl_validation_pipelines.py
@@ -0,0 +1,32 @@
+"""Define jobs to test/validate PUDL models."""
+
+import logging
+
+import coloredlogs
+from dagster import Definitions
+
+from mozilla_sec_eia.library.mlflow import MlflowInterface, get_mlflow_io_manager
+from mozilla_sec_eia.library.pipeline import (
+    PUDL_PIPELINE_VALIDATION_ASSETS,
+    PUDL_PIPELINE_VALIDATION_JOBS,
+    PUDL_PIPELINE_VALIDATION_RESOURCES,
+)
+
+logger = logging.getLogger("catalystcoop")
+log_format = "%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s"
+coloredlogs.install(fmt=log_format, logger=logger)
+
+
+# Configure at launch so experiment name can be supplied by config
+mlflow_interface = MlflowInterface.configure_at_launch()
+
+validation_io_resources = {
+    key: get_mlflow_io_manager(key, mlflow_interface=mlflow_interface)
+    for key in ["mlflow_pandas_artifact_io_manager", "mlflow_metrics_io_manager"]
+} | PUDL_PIPELINE_VALIDATION_RESOURCES
+
+validation_pipelines = Definitions(
+    assets=PUDL_PIPELINE_VALIDATION_ASSETS,
+    jobs=PUDL_PIPELINE_VALIDATION_JOBS,
+    resources=validation_io_resources | {"mlflow_interface": mlflow_interface},
+)
diff --git a/tests/conftest.py b/tests/conftest.py
index 4e9b0f6..bcdedee 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -5,7 +5,7 @@
 
 import mlflow
 import pytest
-from mozilla_sec_eia.library.experiment_tracking import ExperimentTracker
+from mozilla_sec_eia.library.mlflow import MlflowInterface
 
 logger = logging.getLogger(__name__)
 
@@ -37,8 +37,8 @@ def test_dir() -> Path:
     return Path(__file__).parent
 
 
-class TestTracker(ExperimentTracker):
-    """Create sub-class of `ExperimentTracker` to use in testing context.
+class TestTracker(MlflowInterface):
+    """Create sub-class of `MlflowInterface` to use in testing context.
 
     Test class creates an in-memory sqlite db for tracking, and a temporary directory
     for artifact storage.
diff --git a/tests/unit/models/sec10k/extract_test.py b/tests/unit/models/sec10k/extract_test.py
index 0a12a17..9a25fdb 100644
--- a/tests/unit/models/sec10k/extract_test.py
+++ b/tests/unit/models/sec10k/extract_test.py
@@ -1,119 +1,81 @@
 """Test extraction tools/methods."""
 
 import logging
+from unittest.mock import Mock
 
 import pandas as pd
-import pytest
-from dagster import Out, RunConfig, op
-from mozilla_sec_eia.library.experiment_tracking.mlflow_io_managers import (
-    MlflowMetricsIOManager,
-    MlflowPandasArtifactIOManager,
-)
-from mozilla_sec_eia.models.sec10k.extract import (
-    FilingsToExtractConfig,
-    extract_graph_factory,
+from dagster import asset, build_asset_context, materialize
+from mozilla_sec_eia.models.sec10k.extract import Sec10kExtractor
+from mozilla_sec_eia.models.sec10k.pipeline import (
+    sec10k_extraction_asset_factory,
+    sec10k_filing_metadata,
 )
+from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
 
-@pytest.mark.parametrize(
-    "filings_metadata,previous_extraction_metadata,num_filings,num_failed",
-    [
-        (
-            pd.DataFrame(
-                {"filename": ["filing1", "filing2", "filing3", "filing4", "filing5"]}
-            ),
-            pd.DataFrame(
-                {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)}
-            ).set_index("filename"),
-            -1,
-            0,
-        ),
-        (
-            pd.DataFrame(
-                {"filename": ["filing1", "filing2", "filing3", "filing4", "filing5"]}
-            ),
-            pd.DataFrame(
-                {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)}
-            ).set_index("filename"),
-            -1,
-            3,
-        ),
-        (
-            pd.DataFrame(
-                {"filename": ["filing1", "filing2", "filing3", "filing4", "filing5"]}
-            ),
-            pd.DataFrame(
-                {"filename": ["filing1", "filing2"], "success": [True, True]}
-            ).set_index("filename"),
-            -1,
-            0,
-        ),
-        (
-            pd.DataFrame(
-                {"filename": ["filing1", "filing2", "filing3", "filing4", "filing5"]}
-            ),
-            pd.DataFrame(
-                {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)}
-            ).set_index("filename"),
-            2,
-            1,
-        ),
-    ],
-)
-def test_sec10k_extract_pipeline(
-    filings_metadata,
-    previous_extraction_metadata,
-    num_filings,
-    num_failed,
-    test_tracker_factory,
-    get_most_recent_mlflow_run_factory,
-):
-    """Test high level extraction workflow."""
+def test_sec10k_filing_metadata():
+    """Test loading sec10k filing metadata."""
+    # Prepare inputs to sec10k_filing_metadata
+    context = build_asset_context(partition_key="2024q1")
+    cloud_interface = Mock()
+    output_df = pd.DataFrame({"col": ["fake_col"]})
+    cloud_interface.get_metadata.return_value = output_df
+
+    returned_df = sec10k_filing_metadata(
+        context=context,
+        cloud_interface=cloud_interface,
+    )
 
-    @op(out={"extraction_metadata": Out(), "extracted": Out()})
-    def test_extract(
-        filings_to_extract: pd.DataFrame,
-    ) -> tuple[pd.DataFrame, pd.DataFrame]:
-        md = filings_to_extract
-        md["success"] = True
-        md.iloc[:num_failed, 1] = False
-        return md.set_index("filename"), pd.DataFrame()
+    # Check that GCSArchive.get_metadata was called correctly
+    cloud_interface.get_metadata.assert_called_once_with(year_quarter="2024q1")
+    pd.testing.assert_frame_equal(returned_df, output_df)
 
-    dataset_name = "test_pipeline"
-    experiment_name = f"{dataset_name}_extraction"
-    test_tracker = test_tracker_factory(experiment_name)
 
-    test_graph = extract_graph_factory("test_extract", test_extract)
-    resources = {
-        "experiment_tracker": test_tracker,
-        "mlflow_pandas_artifact_io_manager": MlflowPandasArtifactIOManager(
-            experiment_tracker=test_tracker
-        ),
-        "mlflow_metrics_io_manager": MlflowMetricsIOManager(
-            experiment_tracker=test_tracker,
-        ),
-    }
-    graph_result = test_graph.to_job().execute_in_process(
-        resources=resources,
-        run_config=RunConfig(
-            {"get_filings_to_extract": FilingsToExtractConfig(num_filings=num_filings)}
-        ),
-        input_values={
-            "metadata": filings_metadata,
-            "previous_extraction_metadata": previous_extraction_metadata,
-            "previous_extracted": pd.DataFrame(),
-        },
-    )
-    extraction_metadata, metrics = (
-        graph_result.output_value("extraction_metadata"),
-        graph_result.output_value("extraction_metrics"),
+def test_sec10k_extraction():
+    """Test sec10k extraction asset using a fake extractor and fake archive."""
+    fake_extraction_metadata = pd.DataFrame({"extraction_metadata": ["fake_col"]})
+    fake_extracted = pd.DataFrame({"extracted": ["fake_col"]})
+    fake_filing_metadata = pd.DataFrame({"filing_metadata": ["fake_col"]})
+
+    # Create fake Sec10kExtractor
+    class TestSec10kExtractor(Sec10kExtractor):
+        def extract_filings(self, filing_metadata):
+            pd.testing.assert_frame_equal(filing_metadata, fake_filing_metadata)
+            return fake_extraction_metadata, fake_extracted
+
+    # Create fake GCSArchive
+    class FakeArchive(GCSArchive):
+        filings_bucket_name: str = ""
+        labels_bucket_name: str = ""
+        metadata_db_instance_connection: str = ""
+        user: str = ""
+        metadata_db_name: str = ""
+        project: str = ""
+
+        def setup_for_execution(self, context):
+            pass
+
+    # Asset to return fake filing metadata
+    @asset
+    def fake_filing_metadata_asset():
+        return fake_filing_metadata
+
+    # Create fake extraction asset with configured inputs
+    extraction_multi_asset = sec10k_extraction_asset_factory(
+        name="test_sec10k_extraction",
+        sec10k_extractor=TestSec10kExtractor(cloud_interface=FakeArchive()),
+        filing_metadata_asset_name="fake_filing_metadata_asset",
+        extracted_asset_name="test_sec10k_extraction",
+        extraction_metadata_asset_name="test_sec10k_extraction_metadata",
     )
 
-    run = get_most_recent_mlflow_run_factory(experiment_name)
-    assert run.data.metrics["num_failed"] == num_failed
-    assert run.data.metrics["ratio_extracted"] == len(extraction_metadata) / len(
-        filings_metadata
+    # Run assets and review results
+    result = materialize([fake_filing_metadata_asset, extraction_multi_asset])
+    pd.testing.assert_frame_equal(
+        result.asset_value("test_sec10k_extraction_metadata"), fake_extraction_metadata
+    )
+    pd.testing.assert_frame_equal(
+        result.asset_value("test_sec10k_extraction"), fake_extracted
     )
-    assert run.data.metrics == metrics

From f20fb7dcb7a11bd89ede8538b8a950a412a6b22d Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Mon, 2 Sep 2024 15:39:30 -0400
Subject: [PATCH 018/161] Remove old comment

---
 src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py b/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py
index e78f627..75a67f7 100644
--- a/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py
+++ b/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py
@@ -20,7 +20,6 @@ class MlflowBaseIOManager(ConfigurableIOManager):
     """Specify base config and implement helper functions for mlflow io-managers."""
 
     mlflow_interface: MlflowInterface
-    #: By default handles artifacts from current run, but can be used with previous run.
 
     def _get_run_info(self) -> Run:
         """Get mlflow `Run` object using current run id."""

From 92e2e009e395eefc4de960c29e4d03dd04c9b359 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Tue, 3 Sep 2024 09:45:57 -0400
Subject: [PATCH 019/161] Add ex21 to dagster jobs

---
 .../library/mlflow/__init__.py                |  10 +
 .../library/mlflow/mlflow_io_managers.py      |   2 +-
 .../library/mlflow/validation.py              |  88 ------
 .../library/validation_helpers.py             |  81 ++++++
 .../models/sec10k/basic_10k.py                |   2 +
 .../models/sec10k/ex_21/inference.py          | 183 +++++++------
 .../models/sec10k/ex_21/train_extractor.py    |   4 +-
 src/mozilla_sec_eia/models/sec10k/extract.py  |   1 +
 src/mozilla_sec_eia/models/sec10k/pipeline.py | 252 +++++++++++++++---
 .../models/sec10k/utils/layoutlm.py           |  54 ++--
 src/mozilla_sec_eia/pudl_pipelines.py         |  14 +-
 .../pudl_validation_pipelines.py              |  16 +-
 tests/unit/models/sec10k/extract_test.py      |   2 +
 13 files changed, 477 insertions(+), 232 deletions(-)
 delete mode 100644 src/mozilla_sec_eia/library/mlflow/validation.py
 create mode 100644 src/mozilla_sec_eia/library/validation_helpers.py

diff --git a/src/mozilla_sec_eia/library/mlflow/__init__.py b/src/mozilla_sec_eia/library/mlflow/__init__.py
index 380a63c..c5e6642 100644
--- a/src/mozilla_sec_eia/library/mlflow/__init__.py
+++ b/src/mozilla_sec_eia/library/mlflow/__init__.py
@@ -1,5 +1,7 @@
 """Implement tooling to interface with mlflow experiment tracking."""
 
+from dagster import EnvVar
+
 from .mlflow_io_managers import (
     MlflowBaseIOManager,
     MlflowMetricsIOManager,
@@ -10,6 +12,14 @@
     get_most_recent_run,
 )
 
+mlflow_production_interface = MlflowInterface(
+    experiment_name="",
+    tracking_uri=EnvVar("MLFLOW_TRACKING_URI"),
+    project=EnvVar("GCS_PROJECT"),
+    tracking_enabled=False,
+)
+mlflow_train_test_interface = MlflowInterface.configure_at_launch()
+
 
 def get_mlflow_io_manager(
     key: str,
diff --git a/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py b/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py
index 75a67f7..7aa05d7 100644
--- a/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py
+++ b/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py
@@ -94,7 +94,7 @@ def handle_output(self, context: OutputContext, obj: dict[str, float]):
         """Load metrics to mlflow run/experiment created by `MlflowInterface`."""
         mlflow.log_metrics(obj)
 
-    def load_input(self, context: OutputContext) -> dict[str, float]:
+    def load_input(self, context: InputContext) -> dict[str, float]:
         """Log metrics to mlflow run/experiment created by `MlflowInterface`."""
         run = self._get_run_info()
         return run.data.metrics
diff --git a/src/mozilla_sec_eia/library/mlflow/validation.py b/src/mozilla_sec_eia/library/mlflow/validation.py
deleted file mode 100644
index 999cbdd..0000000
--- a/src/mozilla_sec_eia/library/mlflow/validation.py
+++ /dev/null
@@ -1,88 +0,0 @@
-"""Implement common utilities/functions for validating models."""
-
-from importlib import resources
-
-import pandas as pd
-from dagster import AssetIn, AssetsDefinition, asset
-
-
-def load_validation_data_asset_factory(
-    asset_name: str,
-    filename: str,
-    index_cols: str | list[str] | None = None,
-) -> AssetsDefinition:
-    """Construct asset for loading validation data from CSV in `package_data`."""
-
-    @asset(
-        name=asset_name,
-        io_manager_key="mlflow_pandas_artifact_io_manager",
-    )
-    def load_validation_data() -> pd.DataFrame:
-        """Load csv with validation data from `package_data` directory."""
-        df = pd.read_csv(
-            resources.files("mozilla_sec_eia.package_data.validation_data") / filename
-        )
-        if index_cols is not None:
-            df = df.set_index(index_cols)
-        return df
-
-    return load_validation_data
-
-
-def pandas_precision_recall_asset_factory(
-    validation_asset: str,
-    computed_asset: str,
-    value_col: str,
-) -> AssetsDefinition:
-    """Produce asset to compute precision and recall on pandas dataframe.
-
-    The returned asset will take upstream computed/validation assets and compute
-    precision/recall on `value_col`.
-
-    Arg:
-        validation_asset: Upstream asset containing dataframe of validation set.
-        computed_asset: Upstream asset containing dataframe of computed data.
-        value_col: Column to compare when computing metrics.
-    """
-
-    @asset(
-        ins={
-            "computed_set": AssetIn(computed_asset),
-            "validation_set": AssetIn(validation_asset),
-        },
-        io_manager_key="mlflow_metrics_io_manager",
-    )
-    def pandas_compute_precision_recall(
-        computed_set: pd.DataFrame,
-        validation_set: pd.DataFrame,
-    ) -> dict:
-        """Asset which will return computed metrics from dataframes."""
-        # Get initial length of both sets
-        computed_len = len(computed_set)
-        validation_len = len(validation_set)
-
-        # Get index of rows only in one set and make Null in other set
-        idx_validation_only = validation_set.index.difference(computed_set.index)
-        padded_compute_set = pd.concat(
-            [
-                computed_set[value_col],
-                pd.Series([None] * len(idx_validation_only), index=idx_validation_only),
-            ]
-        ).sort_index()
-        idx_compute_only = computed_set.index.difference(validation_set.index)
-        padded_validation_set = pd.concat(
-            [
-                validation_set[value_col],
-                pd.Series([None] * len(idx_compute_only), index=idx_compute_only),
-            ]
-        ).sort_index()
-
-        true_positives = (padded_compute_set == padded_validation_set).sum()
-
-        return {
-            "precision": true_positives / computed_len,
-            "recall": true_positives / validation_len,
-        }
-
-    # Return new asset
-    return pandas_compute_precision_recall
diff --git a/src/mozilla_sec_eia/library/validation_helpers.py b/src/mozilla_sec_eia/library/validation_helpers.py
new file mode 100644
index 0000000..62c1825
--- /dev/null
+++ b/src/mozilla_sec_eia/library/validation_helpers.py
@@ -0,0 +1,81 @@
+"""Implement common utilities/functions for validating models."""
+
+from importlib import resources
+
+import pandas as pd
+
+
+def load_validation_data(
+    filename: str, index_cols: list[str] | None = None
+) -> pd.DataFrame:
+    """Load csv with validation data from `package_data` directory."""
+    df = pd.read_csv(
+        resources.files("mozilla_sec_eia.package_data.validation_data") / filename
+    )
+    if index_cols is not None:
+        df = df.set_index(index_cols)
+    return df
+
+
+def pandas_compute_precision_recall(
+    computed_set: pd.DataFrame,
+    validation_set: pd.DataFrame,
+    value_col: str,
+) -> dict:
+    """Compute precision and recall of `value_col` between two dataframes."""
+    # Get initial length of both sets
+    computed_len = len(computed_set)
+    validation_len = len(validation_set)
+
+    # Get index of rows only in one set and make Null in other set
+    idx_validation_only = validation_set.index.difference(computed_set.index)
+    padded_compute_set = pd.concat(
+        [
+            computed_set[value_col],
+            pd.Series([None] * len(idx_validation_only), index=idx_validation_only),
+        ]
+    ).sort_index()
+    idx_compute_only = computed_set.index.difference(validation_set.index)
+    padded_validation_set = pd.concat(
+        [
+            validation_set[value_col],
+            pd.Series([None] * len(idx_compute_only), index=idx_compute_only),
+        ]
+    ).sort_index()
+
+    true_positives = (padded_compute_set == padded_validation_set).sum()
+
+    return {
+        "precision": true_positives / computed_len,
+        "recall": true_positives / validation_len,
+    }
+
+
+def jaccard_similarity(
+    computed_df: pd.DataFrame, validation_df: pd.DataFrame, value_col: str
+) -> float:
+    """Get the Jaccard similarity of one column across two dataframes.
+
+    Calculated as the size of the intersection of the two sets of
+    column values divided by the size of their union.
+
+    Args:
+        computed_df: Extracted data.
+        validation_df: Expected extraction results.
+        value_col: Column to calculate Jaccard similarity on.
+            Must be present in both dataframes.
+    """
+    # fill nans to make similarity comparison more accurate
+    if (computed_df[value_col].dtype == float) and (
+        validation_df[value_col].dtype == float
+    ):
+        computed_df[value_col] = computed_df[value_col].fillna(999)
+        validation_df[value_col] = validation_df[value_col].fillna(999)
+    else:
+        computed_df[value_col] = computed_df[value_col].fillna("zzz")
+        validation_df[value_col] = validation_df[value_col].fillna("zzz")
+    intersection = set(computed_df[value_col]).intersection(
+        set(validation_df[value_col])
+    )
+    union = set(computed_df[value_col]).union(set(validation_df[value_col]))
+    return float(len(intersection)) / float(len(union))
diff --git a/src/mozilla_sec_eia/models/sec10k/basic_10k.py b/src/mozilla_sec_eia/models/sec10k/basic_10k.py
index dfc2fc8..df57ac5 100644
--- a/src/mozilla_sec_eia/models/sec10k/basic_10k.py
+++ b/src/mozilla_sec_eia/models/sec10k/basic_10k.py
@@ -13,6 +13,8 @@
 class Basic10kExtractor(Sec10kExtractor):
     """Implement Sec10kExtractor for basic 10k company info data."""
 
+    name: str = "basic_10k_extractor"
+
     def _extract_10k(self, filing: Sec10K):
         """Extract basic company data from filing."""
         logger.info(f"Extracting 10K company data from filing: {filing.filename}")
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
index 2016630..56648eb 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
@@ -2,22 +2,25 @@
 
 import logging
 import os
+import tempfile
+from contextlib import contextmanager
 from pathlib import Path
 
 import numpy as np
 import pandas as pd
 import torch
 from datasets import Dataset
+from pydantic import PrivateAttr
 from transformers import (
-    AutoProcessor,
-    LayoutLMv3ForTokenClassification,
     Pipeline,
     pipeline,
 )
 from transformers.tokenization_utils_base import BatchEncoding
 
+from ..extract import Sec10kExtractor
 from ..utils.cloud import get_metadata_filename
 from ..utils.layoutlm import (
+    LayoutlmResource,
     get_id_label_conversions,
     iob_to_label,
     normalize_bboxes,
@@ -184,84 +187,106 @@ def _get_data(dataset):
     yield from dataset
 
 
-def perform_inference(
-    pdfs_dir: Path,
-    model: LayoutLMv3ForTokenClassification,
-    processor: AutoProcessor,
-    extraction_metadata: pd.DataFrame,
-    dataset_ind: list = None,
-    labeled_json_dir: Path = None,
-    has_labels: bool = False,
-    device="cpu",
-):
-    """Predict entities with a fine-tuned model and extract Ex. 21 tables.
-
-    This function starts by creating a HuggingFace dataset from PDFs in `pdfs_dir`
-    that the model can then perform inference on (`create_inference_dataset`).
-    Then it creates an instance of the custom LayoutLM inference pipeline and
-    runs the dataset through the pipeline. The pipeline outputs logits, predictions,
-    and an output dataframe with extracted Ex. 21 table.
-
-    Arguments:
-        pdfs_dir: Path to the directory with PDFs that are being used for inference.
-        model: A fine-tuned LayoutLM model.
-        processor: The tokenizer and encoder for model inputs.
-        extraction_metadata: A dataframe to track extraction success metrics. Should
-            have columns 'filename' and 'success'.
-        dataset_ind: A list of index numbers of dataset records to be used for inference
-            Default is None, in which the entire dataset created from the PDF directory
-            is used.
-        labeled_json_dir: Path to the directory with labeled JSONs from Label Studio. Cannot
-            be None if has_labels is True.
-        has_labels: Boolean, true if the data has associated labels that can be used in
-            visualizing and validating results.
-        device: String or int, specify what computation device to use for inference
-            i.e. "mps", "cpu", "cuda"
-
-    Returns:
-        logits: A list of logits. The list is the length of the number of documents in the
-            dataset (number of PDFs in pdfs_dir). Each logit object in the list is of
-            shape (batch_size, seq_len, num_labels). Seq_len is
-            the same as token length (512 in this case).
-        predictions: A list of predictions. The list is the length of the number of documents
-            in the dataset (number of PDFs in pdfs_dir).
-            From the logits, we take the highest score for each token, using argmax.
-            This serves as the predicted label for each token. It is shape (seq_len) or token
-            length.
-        output_dfs: The extracted Ex. 21 tables. This is one big dataframe with an ID column
-            that is the filename of the extracted Ex. 21. Dataframe contains columns id,
-            subsidiary, loc, own_per.
-    """
-    dataset = create_inference_dataset(
-        pdfs_dir=pdfs_dir, labeled_json_dir=labeled_json_dir, has_labels=has_labels
-    )
-    if dataset_ind:
-        dataset = dataset.select(dataset_ind)
-
-    # TODO: figure out device argument
-    pipe = pipeline(
-        "token-classification",
-        model=model,
-        tokenizer=processor,
-        pipeline_class=LayoutLMInferencePipeline,
-        device=device,
-    )
+class Exhibit21Extractor(Sec10kExtractor):
+    """Implement `Sec10kExtractor` interface for exhibit 21 data."""
+
+    layoutlm: LayoutlmResource
+    name: str = "exhibit21_extractor"
+    device: str = "cpu"
+    has_labels: bool = False
+    dataset_ind: list | None = None
+    _pdf_dir: Path = PrivateAttr()
+    _labeled_json_dir: Path | None = PrivateAttr(default=None)
+
+    @contextmanager
+    def yield_for_execution(self, context):
+        """Setup temp path working directories."""
+        with (
+            tempfile.TemporaryDirectory() as pdf_dir,
+            tempfile.TemporaryDirectory() as labeled_json_dir,
+        ):
+            self._pdf_dir = pdf_dir
+            if self.has_labels:
+                self._labeled_json_dir = labeled_json_dir
+            yield self
+
+    def extract_filings(
+        self, filing_metadata: pd.DataFrame
+    ) -> tuple[pd.DataFrame, pd.DataFrame]:
+        """Predict entities with a fine-tuned model and extract Ex. 21 tables.
+
+        This function starts by creating a HuggingFace dataset from PDFs in `pdfs_dir`
+        that the model can then perform inference on (`create_inference_dataset`).
+        Then it creates an instance of the custom LayoutLM inference pipeline and
+        runs the dataset through the pipeline. The pipeline outputs logits, predictions,
+        and an output dataframe with extracted Ex. 21 table.
+
+        Arguments:
+            pdfs_dir: Path to the directory with PDFs that are being used for inference.
+            model: A fine-tuned LayoutLM model.
+            processor: The tokenizer and encoder for model inputs.
+            extraction_metadata: A dataframe to track extraction success metrics. Should
+                have columns 'filename' and 'success'.
+            dataset_ind: A list of index numbers of dataset records to be used for inference
+                Default is None, in which the entire dataset created from the PDF directory
+                is used.
+            labeled_json_dir: Path to the directory with labeled JSONs from Label Studio. Cannot
+                be None if has_labels is True.
+            has_labels: Boolean, true if the data has associated labels that can be used in
+                visualizing and validating results.
+            device: String or int, specify what computation device to use for inference
+                i.e. "mps", "cpu", "cuda"
+
+        Returns:
+            logits: A list of logits. The list is the length of the number of documents in the
+                dataset (number of PDFs in pdfs_dir). Each logit object in the list is of
+                shape (batch_size, seq_len, num_labels). Seq_len is
+                the same as token length (512 in this case).
+            predictions: A list of predictions. The list is the length of the number of documents
+                in the dataset (number of PDFs in pdfs_dir).
+                From the logits, we take the highest score for each token, using argmax.
+                This serves as the predicted label for each token. It is shape (seq_len) or token
+                length.
+            output_dfs: The extracted Ex. 21 tables. This is one big dataframe with an ID column
+                that is the filename of the extracted Ex. 21. Dataframe contains columns id,
+                subsidiary, loc, own_per.
+        """
+        dataset = create_inference_dataset(
+            pdfs_dir=self._pdf_dir,
+            labeled_json_dir=self._labeled_json_dir,
+            has_labels=self.has_labels,
+        )
+        if self.dataset_ind:
+            dataset = dataset.select(self.dataset_ind)
+
+        # TODO: figure out device argument
+        model, processor = self.layoutlm.get_model_components()
+        pipe = pipeline(
+            "token-classification",
+            model=model,
+            tokenizer=processor,
+            pipeline_class=LayoutLMInferencePipeline,
+            device=self.device,
+        )
 
-    logits = []
-    predictions = []
-    all_output_df = pd.DataFrame(columns=["id", "subsidiary", "loc", "own_per"])
-    for logit, pred, output_df in pipe(_get_data(dataset)):
-        logits.append(logit)
-        predictions.append(pred)
-        if not output_df.empty:
-            filename = get_metadata_filename(output_df["id"].iloc[0])
-            extraction_metadata.loc[filename, ["success"]] = True
-        all_output_df = pd.concat([all_output_df, output_df])
-    all_output_df.columns.name = None
-    all_output_df = clean_extracted_df(all_output_df)
-    all_output_df = all_output_df[["id", "subsidiary", "loc", "own_per"]]
-    all_output_df = all_output_df.reset_index(drop=True)
-    return logits, predictions, all_output_df, extraction_metadata
+        logits = []
+        predictions = []
+        all_output_df = pd.DataFrame(columns=["id", "subsidiary", "loc", "own_per"])
+        extraction_metadata = pd.DataFrame(
+            {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)}
+        ).set_index("filename")
+        for logit, pred, output_df in pipe(_get_data(dataset)):
+            logits.append(logit)
+            predictions.append(pred)
+            if not output_df.empty:
+                filename = get_metadata_filename(output_df["id"].iloc[0])
+                extraction_metadata.loc[filename, ["success"]] = True
+            all_output_df = pd.concat([all_output_df, output_df])
+        all_output_df.columns.name = None
+        all_output_df = clean_extracted_df(all_output_df)
+        all_output_df = all_output_df[["id", "subsidiary", "loc", "own_per"]]
+        all_output_df = all_output_df.reset_index(drop=True)
+        return logits, predictions, all_output_df, extraction_metadata
 
 
 class LayoutLMInferencePipeline(Pipeline):
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/train_extractor.py b/src/mozilla_sec_eia/models/sec10k/ex_21/train_extractor.py
index 53ed85e..00c80f3 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/train_extractor.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/train_extractor.py
@@ -27,7 +27,7 @@
 )
 from transformers.data.data_collator import default_data_collator
 
-from ..utils.layoutlm import get_id_label_conversions, log_model
+from ..utils.layoutlm import get_id_label_conversions
 from .create_labeled_dataset import format_as_ner_annotations
 
 LABELS = [
@@ -191,4 +191,4 @@ def train_model(
     # Train inside mlflow run. Mlflow will automatically handle logging training metrcis
     with mlflow.start_run():
         trainer.train()
-        log_model(trainer)
+        # TODO: restore model logging (`log_model` was replaced by `LayoutlmIOManager`)
diff --git a/src/mozilla_sec_eia/models/sec10k/extract.py b/src/mozilla_sec_eia/models/sec10k/extract.py
index ad3760d..4461b49 100644
--- a/src/mozilla_sec_eia/models/sec10k/extract.py
+++ b/src/mozilla_sec_eia/models/sec10k/extract.py
@@ -10,6 +10,7 @@ class Sec10kExtractor(ConfigurableResource):
     """Base class for extracting SEC 10k data."""
 
     cloud_interface: GCSArchive
+    name: str
 
     def extract_filings(
         self, filing_metadata: pd.DataFrame
diff --git a/src/mozilla_sec_eia/models/sec10k/pipeline.py b/src/mozilla_sec_eia/models/sec10k/pipeline.py
index e4b7d0d..5604d6a 100644
--- a/src/mozilla_sec_eia/models/sec10k/pipeline.py
+++ b/src/mozilla_sec_eia/models/sec10k/pipeline.py
@@ -14,15 +14,21 @@
     with_resources,
 )
 
-from mozilla_sec_eia.library.mlflow import validation
+from mozilla_sec_eia.library import validation_helpers
+from mozilla_sec_eia.library.mlflow import (
+    mlflow_production_interface,
+    mlflow_train_test_interface,
+)
 from mozilla_sec_eia.library.pipeline import (
     create_production_pipeline,
     create_validation_pipeline,
 )
 
 from .basic_10k import Basic10kExtractor
+from .ex_21.inference import Exhibit21Extractor, clean_extracted_df
 from .extract import Sec10kExtractor
-from .utils.cloud import GCSArchive, cloud_interface_resource
+from .utils.cloud import GCSArchive, cloud_interface_resource, get_metadata_filename
+from .utils.layoutlm import LayoutlmResource
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
@@ -45,6 +51,17 @@ class ExtractionMetadataSchema(pa.DataFrameModel):
 )
 
 
+@asset(partitions_def=partitions_def)
+def sec10k_filing_metadata(
+    context: AssetExecutionContext,
+    cloud_interface: GCSArchive,
+) -> pd.DataFrame:
+    """Return filing metadata for year_quarter partition."""
+    year_quarter = context.partition_key
+    df = cloud_interface.get_metadata(year_quarter=year_quarter)
+    return df
+
+
 def sec10k_extraction_asset_factory(
     name: str,
     sec10k_extractor: Sec10kExtractor,
@@ -75,50 +92,55 @@ def sec10k_extraction_asset_factory(
         },
         ins={"sec10k_filing_metadata": AssetIn(filing_metadata_asset_name)},
         partitions_def=partitions_def,
+        required_resource_keys={sec10k_extractor.name},
     )
     def extract_filings(
-        sec10k_extractor: Sec10kExtractor, sec10k_filing_metadata: pd.DataFrame
+        context: AssetExecutionContext, sec10k_filing_metadata: pd.DataFrame
     ) -> tuple[pd.DataFrame, pd.DataFrame]:
         """Run Sec10kExtractor on selected partition and return."""
-        extraction_metadata, extracted = sec10k_extractor.extract_filings(
+        extractor = context.resources.original_resource_dict[sec10k_extractor.name]
+        extraction_metadata, extracted = extractor.extract_filings(
             sec10k_filing_metadata
         )
         return extraction_metadata, extracted
 
-    return with_resources([extract_filings], {"sec10k_extractor": sec10k_extractor})[0]
+    return with_resources([extract_filings], {sec10k_extractor.name: sec10k_extractor})[
+        0
+    ]
 
 
-@asset(partitions_def=partitions_def)
-def sec10k_filing_metadata(
-    context: AssetExecutionContext,
-    cloud_interface: GCSArchive,
-) -> pd.DataFrame:
-    """Return filing metadata for year_quarter partition."""
-    year_quarter = context.partition_key
-    df = cloud_interface.get_metadata(year_quarter=year_quarter)
-    return df
+@asset
+def basic_10k_validation_set() -> pd.DataFrame:
+    """Return dataframe containing basic 10k validation data."""
+    return validation_helpers.load_validation_data(
+        "basic_10k_labels.csv",
+        index_cols=["filename", "filer_count", "block", "block_count", "key"],
+    )
 
 
-# Create asset to load basic 10k validation data
-basic_10k_validation_set = validation.load_validation_data_asset_factory(
-    "basic_10k_validation_set",
-    "basic_10k_labels.csv",
-    index_cols=["filename", "filer_count", "block", "block_count", "key"],
-)
+basic_10k_extracted_validation_asset_name = "basic_10k_company_info_validation"
 
 
-# Create asset to compute precision/recall on basic 10k extraction of validation set
-basic_10k_extracted_validation_asset_name = "basic_10k_company_info_validation"
-basic_10k_extraction_validation_metrics = (
-    validation.pandas_precision_recall_asset_factory(
-        validation_asset="basic_10k_validation_set",
-        computed_asset=basic_10k_extracted_validation_asset_name,
-        value_col="value",
-    )
+@asset(
+    ins={
+        basic_10k_extracted_validation_asset_name: AssetIn(
+            basic_10k_extracted_validation_asset_name
+        ),
+        "basic_10k_validation_set": AssetIn(),
+    },
+    io_manager_key="mlflow_metrics_io_manager",
 )
+def basic_10k_extraction_validation_metrics(**kwargs):
+    """Compute basic 10k extraction validation metrics."""
+    computed = kwargs[basic_10k_extracted_validation_asset_name]
+    validation = kwargs["basic_10k_validation_set"]
+
+    return validation_helpers.pandas_compute_precision_recall(
+        computed, validation, value_col="value"
+    )
 
 
-@asset(name="sec10k_filing_metadata_validation")
+@asset
 def basic_10k_validation_filing_metadata(
     cloud_interface: GCSArchive,
     basic_10k_validation_set: pd.DataFrame,
@@ -157,7 +179,7 @@ def basic_10k_validation_filing_metadata(
         sec10k_extraction_asset_factory(
             "basic_10k",
             Basic10kExtractor(cloud_interface=cloud_interface_resource),
-            filing_metadata_asset_name="sec10k_filing_metadata_validation",
+            filing_metadata_asset_name="basic_10k_validation_filing_metadata",
             extraction_metadata_asset_name="basic_10k_extraction_validation_metadata",
             extracted_asset_name=basic_10k_extracted_validation_asset_name,
         ),
@@ -166,3 +188,173 @@ def basic_10k_validation_filing_metadata(
     ],
     resources={"cloud_interface": cloud_interface_resource},
 )
+
+
+@asset
+def ex21_validation_set() -> pd.DataFrame:
+    """Return dataframe containing Ex. 21 validation data."""
+    return clean_ex21_validation_set(
+        validation_helpers.load_validation_data("ex21_labels.csv")
+    )
+
+
+@asset
+def ex21_validation_filing_metadata(
+    cloud_interface: GCSArchive,
+    ex21_validation_set: pd.DataFrame,
+) -> pd.DataFrame:
+    """Get sec 10k filing metadata from validation set."""
+    filing_metadata = cloud_interface.get_metadata()
+    return filing_metadata[
+        filing_metadata["filename"].isin(
+            ex21_validation_set.index.get_level_values("filename").unique()
+        )
+    ]
+
+
+ex21_extracted_validation_asset_name = "ex21_validation"
+
+
+@multi_asset(
+    ins={
+        "computed_df": AssetIn(ex21_extracted_validation_asset_name),
+        "validation_df": AssetIn("ex21_validation_set"),
+    },
+    outs={
+        "ex21_jaccard_per_table": AssetOut(
+            io_manager_key="mlflow_pandas_artifact_io_manager"
+        ),
+        "ex21_precision_recall_per_table": AssetOut(
+            io_manager_key="mlflow_pandas_artifact_io_manager"
+        ),
+        "ex21_incorrect_filenames": AssetOut(
+            io_manager_key="mlflow_pandas_artifact_io_manager"
+        ),
+        "ex21_extraction_metrics": AssetOut(io_manager_key="mlflow_metrics_io_manager"),
+    },
+)
+def ex21_validation_metrics(computed_df: pd.DataFrame, validation_df: pd.DataFrame):
+    """Compute validation metrics for Ex. 21 extraction."""
+    shared_cols = validation_df.columns.intersection(computed_df.columns)
+    validation_df = validation_df.astype(computed_df[shared_cols].dtypes)
+    n_equal = 0
+    validation_filenames = validation_df["id"].unique()
+    n_files = len(validation_filenames)
+    table_metrics_dict = {}
+    jaccard_dict = {}
+    incorrect_files = []
+    # iterate through each file and check each extracted table
+    for filename in validation_filenames:
+        extracted_table_df = computed_df[computed_df["id"] == filename].reset_index(
+            drop=True
+        )
+        validation_table_df = validation_df[
+            validation_df["id"] == filename
+        ].reset_index(drop=True)
+        # check if the tables are exactly equal
+        if extracted_table_df.equals(validation_table_df):
+            # TODO: strip llc and other company strings before comparison
+            n_equal += 1
+        else:
+            incorrect_files.append(filename)
+        # compute precision and recall for each column
+        table_metrics_dict[filename] = {}
+        jaccard_dict[filename] = {}
+        for col in ["subsidiary", "loc", "own_per"]:
+            table_prec_recall = validation_helpers.pandas_compute_precision_recall(
+                extracted_table_df, validation_table_df, value_col=col
+            )
+            table_metrics_dict[filename][f"{col}_precision"] = table_prec_recall[
+                "precision"
+            ]
+            table_metrics_dict[filename][f"{col}_recall"] = table_prec_recall["recall"]
+            # get the jaccard similarity between columns
+            jaccard_dict[filename][col] = validation_helpers.jaccard_similarity(
+                computed_df=extracted_table_df,
+                validation_df=validation_table_df,
+                value_col=col,
+            )
+
+    jaccard_df = pd.DataFrame.from_dict(jaccard_dict, orient="index").reset_index()
+    prec_recall_df = pd.DataFrame.from_dict(
+        table_metrics_dict, orient="index"
+    ).reset_index()
+
+    return (
+        jaccard_df,
+        prec_recall_df,
+        pd.DataFrame({"filename": incorrect_files}),
+        {
+            "table_accuracy": n_equal / n_files,
+            "avg_subsidiary_jaccard_sim": jaccard_df["subsidiary"].sum() / n_files,
+            "avg_location_jaccard_sim": jaccard_df["loc"].sum() / n_files,
+            "avg_own_per_jaccard_sim": jaccard_df["own_per"].sum() / n_files,
+            "avg_subsidiary_precision": prec_recall_df["subsidiary_precision"].sum()
+            / n_files,
+            "avg_location_precision": prec_recall_df["loc_precision"].sum() / n_files,
+            "avg_own_per_precision": prec_recall_df["own_per_precision"].sum()
+            / n_files,
+            "avg_subsidiary_recall": prec_recall_df["subsidiary_recall"].sum()
+            / n_files,
+            "avg_location_recall": prec_recall_df["loc_recall"].sum() / n_files,
+            "avg_own_per_recall": prec_recall_df["own_per_recall"].sum() / n_files,
+        },
+    )
+
+
+def clean_ex21_validation_set(validation_df: pd.DataFrame):
+    """Clean Ex. 21 validation data to match extracted format."""
+    validation_df = validation_df.rename(
+        columns={
+            "Filename": "id",
+            "Subsidiary": "subsidiary",
+            "Location of Incorporation": "loc",
+            "Ownership Percentage": "own_per",
+        }
+    )
+    validation_df["own_per"] = validation_df["own_per"].astype(str)
+    validation_df["filename"] = validation_df["id"].apply(get_metadata_filename)
+    validation_df = clean_extracted_df(validation_df)
+    return validation_df
+
+
+# Register ex21 extraction pipeline
+create_production_pipeline(
+    "ex21_extraction",
+    [
+        sec10k_filing_metadata,
+        sec10k_extraction_asset_factory(
+            "ex21",
+            Exhibit21Extractor(
+                cloud_interface=cloud_interface_resource,
+                layoutlm=LayoutlmResource(mlflow_interface=mlflow_production_interface),
+            ),
+            partitions_def=partitions_def,
+            extraction_metadata_asset_name="ex21_extraction_metadata",
+            extracted_asset_name="ex21_company_info",
+        ),
+    ],
+    resources={"cloud_interface": cloud_interface_resource},
+)
+
+
+# Register ex21 extraction validation pipeline
+create_validation_pipeline(
+    "ex21_extraction",
+    [
+        ex21_validation_filing_metadata,
+        sec10k_extraction_asset_factory(
+            "ex21",
+            Exhibit21Extractor(
+                cloud_interface=cloud_interface_resource,
+                layoutlm=LayoutlmResource(mlflow_interface=mlflow_train_test_interface),
+            ),
+            filing_metadata_asset_name="ex21_validation_filing_metadata",
+            extraction_metadata_asset_name="ex21_extraction_validation_metadata",
+            extracted_asset_name=ex21_extracted_validation_asset_name,
+        ),
+        ex21_validation_set,
+        ex21_validation_metrics,
+    ],
+    resources={"cloud_interface": cloud_interface_resource},
+)
diff --git a/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py b/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py
index ba31fb5..1e88052 100644
--- a/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py
+++ b/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py
@@ -1,30 +1,54 @@
 """Util functions for training and predicting with LayoutLM on Ex. 21 tables."""
 
 import mlflow
+from dagster import ConfigurableResource, InputContext, OutputContext
 from PIL import ImageDraw, ImageFont
+from pydantic import PrivateAttr
 from transformers import (
     Trainer,
 )
 
+from mozilla_sec_eia.library.mlflow import MlflowBaseIOManager, MlflowInterface
 
-def log_model(finetuned_model: Trainer):
-    """Log fine-tuned model to mlflow artifacts."""
-    model = {"model": finetuned_model.model, "tokenizer": finetuned_model.tokenizer}
-    mlflow.transformers.log_model(
-        model, artifact_path="layoutlm_extractor", task="token-classification"
-    )
 
+def _load_pretrained_layoutlm(version: str = "latest") -> dict:
+    """Function to load layoutlm from mlflow."""
+    path = f"models:/layoutlm_extractor/{version}"
 
-def load_model(version=1):
-    """Load fine-tuned model checkpoint from mlflow artifacts.
+    return mlflow.transformers.load_model(path, return_type="components")
 
-    Returns: A dictionary of the saved individual components of
-        either the Pipeline or the pre-trained model.
-    """
-    # TODO: want more ability to give load_model a model path?
-    return mlflow.transformers.load_model(
-        f"models:/layoutlm_extractor/{version}", return_type="components"
-    )
+
+class LayoutlmIOManager(MlflowBaseIOManager):
+    """Load and log models with mlflow tracking server."""
+
+    version: int | None = None
+
+    def handle_output(self, context: OutputContext, finetuned_model: Trainer):
+        """Log fine-tuned model and tokenizer to mlflow as `layoutlm_extractor`."""
+        model = {"model": finetuned_model.model, "tokenizer": finetuned_model.tokenizer}
+        mlflow.transformers.log_model(
+            model, artifact_path="layoutlm_extractor", task="token-classification"
+        )
+
+    def load_input(self, context: InputContext) -> dict:
+        """Load pretrained layoutlm model components from mlflow."""
+        return _load_pretrained_layoutlm(self.version)
+
+
+class LayoutlmResource(ConfigurableResource):
+    """Dagster resource for loading/using pretrained layoutlm model as a resource."""
+
+    mlflow_interface: MlflowInterface
+    version: str | None = None
+    _model_components: dict = PrivateAttr()
+
+    def setup_for_execution(self, context):
+        """Load layoutlm from mlflow."""
+        self._model_components = _load_pretrained_layoutlm(self.version)
+
+    def get_model_components(self):
+        """Return model components from loaded model."""
+        return self._model_components["model"], self._model_components["tokenizer"]
 
 
 def normalize_bboxes(txt_df, pg_meta_df):
diff --git a/src/mozilla_sec_eia/pudl_pipelines.py b/src/mozilla_sec_eia/pudl_pipelines.py
index 3a427c4..57b3808 100644
--- a/src/mozilla_sec_eia/pudl_pipelines.py
+++ b/src/mozilla_sec_eia/pudl_pipelines.py
@@ -3,9 +3,9 @@
 import logging
 
 import coloredlogs
-from dagster import Definitions, EnvVar
+from dagster import Definitions
 
-from mozilla_sec_eia.library.mlflow import MlflowInterface
+from mozilla_sec_eia.library.mlflow import mlflow_production_interface
 from mozilla_sec_eia.library.pipeline import (
     PUDL_PIPELINE_PRODUCTION_ASSETS,
     PUDL_PIPELINE_PRODUCTION_JOBS,
@@ -16,17 +16,11 @@
 log_format = "%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s"
 coloredlogs.install(fmt=log_format, logger=logger)
 
-
-mlflow_interface = MlflowInterface(
-    experiment_name="",
-    tracking_uri=EnvVar("MLFLOW_TRACKING_URI"),
-    project=EnvVar("GCS_PROJECT"),
-)
-
 production_io_resources = {} | PUDL_PIPELINE_PRODUCTION_RESOURCES
 
 production_pipelines = Definitions(
     assets=PUDL_PIPELINE_PRODUCTION_ASSETS,
     jobs=PUDL_PIPELINE_PRODUCTION_JOBS,
-    resources=production_io_resources | {"mlflow_interface": mlflow_interface},
+    resources=production_io_resources
+    | {"mlflow_interface": mlflow_production_interface},
 )
diff --git a/src/mozilla_sec_eia/pudl_validation_pipelines.py b/src/mozilla_sec_eia/pudl_validation_pipelines.py
index 44e7a55..63e9049 100644
--- a/src/mozilla_sec_eia/pudl_validation_pipelines.py
+++ b/src/mozilla_sec_eia/pudl_validation_pipelines.py
@@ -5,7 +5,10 @@
 import coloredlogs
 from dagster import Definitions
 
-from mozilla_sec_eia.library.mlflow import MlflowInterface, get_mlflow_io_manager
+from mozilla_sec_eia.library.mlflow import (
+    get_mlflow_io_manager,
+    mlflow_train_test_interface,
+)
 from mozilla_sec_eia.library.pipeline import (
     PUDL_PIPELINE_VALIDATION_ASSETS,
     PUDL_PIPELINE_VALIDATION_JOBS,
@@ -16,17 +19,16 @@
 log_format = "%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s"
 coloredlogs.install(fmt=log_format, logger=logger)
 
-
-# Configure at launch so experiment name can be supplied by config
-mlflow_interface = MlflowInterface.configure_at_launch()
-
 validation_io_resources = {
-    key: get_mlflow_io_manager(key, mlflow_interface=mlflow_interface)
+    key: get_mlflow_io_manager(
+        key, mlflow_interface=mlflow_train_test_interface, pandas_file_type="csv"
+    )
     for key in ["mlflow_pandas_artifact_io_manager", "mlflow_metrics_io_manager"]
 } | PUDL_PIPELINE_VALIDATION_RESOURCES
 
 validation_pipelines = Definitions(
     assets=PUDL_PIPELINE_VALIDATION_ASSETS,
     jobs=PUDL_PIPELINE_VALIDATION_JOBS,
-    resources=validation_io_resources | {"mlflow_interface": mlflow_interface},
+    resources=validation_io_resources
+    | {"mlflow_interface": mlflow_train_test_interface},
 )
diff --git a/tests/unit/models/sec10k/extract_test.py b/tests/unit/models/sec10k/extract_test.py
index 9a25fdb..c1cdac0 100644
--- a/tests/unit/models/sec10k/extract_test.py
+++ b/tests/unit/models/sec10k/extract_test.py
@@ -41,6 +41,8 @@ def test_sec10k_extraction():
 
     # Create fake Sec10kExtractor
     class TestSec10kExtractor(Sec10kExtractor):
+        name: str = "test_extractor"
+
         def extract_filings(self, filing_metadata):
             pd.testing.assert_frame_equal(filing_metadata, fake_filing_metadata)
             return fake_extraction_metadata, fake_extracted

From 520e6d122cf8b06945108018730af75b18d9952e Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Tue, 3 Sep 2024 10:57:36 -0400
Subject: [PATCH 020/161] Prep for multiple code locations

---
 .../library/mlflow/__init__.py                |  24 +-
 src/mozilla_sec_eia/library/model_jobs.py     |  88 +++++
 src/mozilla_sec_eia/library/pipeline.py       |  93 -----
 src/mozilla_sec_eia/models/sec10k/__init__.py |  57 +++
 .../models/sec10k/basic_10k.py                |  85 ++++-
 .../models/sec10k/ex_21/__init__.py           | 172 +++++++++
 src/mozilla_sec_eia/models/sec10k/extract.py  |  75 +++-
 src/mozilla_sec_eia/models/sec10k/pipeline.py | 360 ------------------
 src/mozilla_sec_eia/pudl_pipelines.py         |  26 --
 .../pudl_validation_pipelines.py              |  34 --
 tests/unit/models/sec10k/extract_test.py      |   5 +-
 11 files changed, 491 insertions(+), 528 deletions(-)
 create mode 100644 src/mozilla_sec_eia/library/model_jobs.py
 delete mode 100644 src/mozilla_sec_eia/library/pipeline.py
 delete mode 100644 src/mozilla_sec_eia/models/sec10k/pipeline.py
 delete mode 100644 src/mozilla_sec_eia/pudl_pipelines.py
 delete mode 100644 src/mozilla_sec_eia/pudl_validation_pipelines.py

diff --git a/src/mozilla_sec_eia/library/mlflow/__init__.py b/src/mozilla_sec_eia/library/mlflow/__init__.py
index c5e6642..5987f75 100644
--- a/src/mozilla_sec_eia/library/mlflow/__init__.py
+++ b/src/mozilla_sec_eia/library/mlflow/__init__.py
@@ -1,7 +1,5 @@
 """Implement tooling to interface with mlflow experiment tracking."""
 
-from dagster import EnvVar
-
 from .mlflow_io_managers import (
     MlflowBaseIOManager,
     MlflowMetricsIOManager,
@@ -12,14 +10,6 @@
     get_most_recent_run,
 )
 
-mlflow_production_interface = MlflowInterface(
-    experiment_name="",
-    tracking_uri=EnvVar("MLFLOW_TRACKING_URI"),
-    project=EnvVar("GCS_PROJECT"),
-    tracking_enabled=False,
-)
-mlflow_train_test_interface = MlflowInterface.configure_at_launch()
-
 
 def get_mlflow_io_manager(
     key: str,
@@ -40,3 +30,17 @@ def get_mlflow_io_manager(
         raise RuntimeError(f"MlFlow IO-manager, {key}, does not exist.")
 
     return io_manager
+
+
+mlflow_interface_resource = MlflowInterface.configure_at_launch()
+mlflow_validation_io_managers = {
+    "mlflow_metrics_io_manager": get_mlflow_io_manager(
+        "mlflow_metrics_io_manager",
+        mlflow_interface=mlflow_interface_resource,
+    ),
+    "mlflow_pandas_artifact_io_manager": get_mlflow_io_manager(
+        "mlflow_pandas_artifact_io_manager",
+        mlflow_interface=mlflow_interface_resource,
+        pandas_file_type="csv",
+    ),
+}
diff --git a/src/mozilla_sec_eia/library/model_jobs.py b/src/mozilla_sec_eia/library/model_jobs.py
new file mode 100644
index 0000000..8bdfc5c
--- /dev/null
+++ b/src/mozilla_sec_eia/library/model_jobs.py
@@ -0,0 +1,88 @@
+"""Implement helper methods for constructing dagster jobs.
+
+Methods defined here are the main interface for constructing PUDL model jobs.
+`create_production_model_job` will produce a dagster job that will use the default
+multi-process executor to run a PUDL model. `create_validation_model_job` is meant for
+testing/validating models with an mlflow run backing the dagster run for logging.
+To avoid problems with mlflow runs, test/validation jobs are run with the dagster
+in process executor.
+"""
+
+import mlflow
+from dagster import (
+    AssetsDefinition,
+    HookContext,
+    JobDefinition,
+    define_asset_job,
+    failure_hook,
+    in_process_executor,
+    success_hook,
+)
+from mlflow.entities import RunStatus
+
+
+def create_production_model_job(
+    job_name: str,
+    assets: list[AssetsDefinition],
+    **kwargs,
+) -> JobDefinition:
+    """Return a dagster asset job for `assets` with mlflow tracking disabled."""
+    return define_asset_job(
+        job_name,
+        selection=assets,
+        config={
+            "ops": {},
+            "resources": {
+                "mlflow_interface": {
+                    "config": {
+                        "experiment_name": job_name,
+                        "tracking_enabled": False,
+                    }
+                }
+            },
+        },
+        **kwargs,
+    )
+
+
+@success_hook(required_resource_keys={"mlflow_interface"})
+def log_op_config(context: HookContext):
+    """Log any config supplied to ops/assets in validation job to mlflow tracking server."""
+    if context.op_config is not None:
+        mlflow.log_params(context.op_config)
+
+
+@failure_hook(required_resource_keys={"mlflow_interface"})
+def end_run_on_failure(context: HookContext):
+    """Inform mlflow about job failure."""
+    if isinstance(context.op_exception, KeyboardInterrupt):
+        mlflow.end_run(status=RunStatus.to_string(RunStatus.KILLED))
+    else:
+        mlflow.end_run(status=RunStatus.to_string(RunStatus.FAILED))
+
+
+def create_validation_model_job(
+    job_name: str,
+    assets: list[AssetsDefinition],
+    **kwargs,
+):
+    """Return an in-process asset job for `assets` with mlflow tracking enabled."""
+    return define_asset_job(
+        job_name,
+        selection=assets,
+        executor_def=in_process_executor,
+        hooks={log_op_config, end_run_on_failure},
+        # Configure mlflow_interface for job with appropriate experiment name
+        config={
+            "ops": {},
+            "resources": {
+                "mlflow_interface": {
+                    "config": {
+                        "experiment_name": job_name,
+                        "tracking_enabled": True,
+                    }
+                }
+            },
+        },
+        **kwargs,
+    )
diff --git a/src/mozilla_sec_eia/library/pipeline.py b/src/mozilla_sec_eia/library/pipeline.py
deleted file mode 100644
index e35f1a9..0000000
--- a/src/mozilla_sec_eia/library/pipeline.py
+++ /dev/null
@@ -1,93 +0,0 @@
-"""Implement helper methods for constructing dagster jobs.
-
-Methods defined here are the main interface for constructing PUDL model jobs.
-`create_production_pipeline` will produce a dagster job that will use the default
-multi-process executor to run a PUDL model. `create_validation_pipeline` is meant for
-testing/validating models with an mlflow run backing the dagster run for logging.
-"""
-
-import mlflow
-from dagster import (
-    AssetsDefinition,
-    HookContext,
-    ResourceDefinition,
-    define_asset_job,
-    failure_hook,
-    in_process_executor,
-    success_hook,
-)
-from mlflow.entities import RunStatus
-
-PUDL_PIPELINE_PRODUCTION_JOBS = []
-PUDL_PIPELINE_PRODUCTION_ASSETS = []
-PUDL_PIPELINE_PRODUCTION_RESOURCES = {}
-
-PUDL_PIPELINE_VALIDATION_JOBS = []
-PUDL_PIPELINE_VALIDATION_ASSETS = []
-PUDL_PIPELINE_VALIDATION_RESOURCES = {}
-
-
-def create_production_pipeline(
-    pipeline_name: str,
-    assets: list[AssetsDefinition],
-    resources: dict[str, ResourceDefinition],
-    **kwargs,
-):
-    """Construct a dagster job and supply Definitions with assets and resources."""
-    PUDL_PIPELINE_PRODUCTION_JOBS.append(
-        define_asset_job(
-            pipeline_name,
-            selection=assets,
-            **kwargs,
-        )
-    )
-    PUDL_PIPELINE_PRODUCTION_ASSETS.extend(assets)
-    PUDL_PIPELINE_PRODUCTION_RESOURCES.update(resources)
-
-
-@success_hook(required_resource_keys={"mlflow_interface"})
-def log_op_config(context: HookContext):
-    """Log any config supplied to ops/assets in validation job to mlflow tracking server."""
-    if context.op_config is not None:
-        mlflow.log_params(context.op_config)
-
-
-@failure_hook(required_resource_keys={"mlflow_interface"})
-def end_run_on_failure(context: HookContext):
-    """Inform mlflow about job failure."""
-    if isinstance(context.op_exception, KeyboardInterrupt):
-        mlflow.end_run(status=RunStatus.to_string(RunStatus.KILLED))
-    else:
-        mlflow.end_run(status=RunStatus.to_string(RunStatus.FAILED))
-
-
-def create_validation_pipeline(
-    pipeline_name: str,
-    assets: list[AssetsDefinition],
-    resources: dict[str, ResourceDefinition],
-    **kwargs,
-):
-    """Construct a dagster job and supply Definitions with assets and resources."""
-    PUDL_PIPELINE_VALIDATION_JOBS.append(
-        define_asset_job(
-            pipeline_name,
-            selection=assets,
-            executor_def=in_process_executor,
-            hooks={log_op_config, end_run_on_failure},
-            # Configure mlflow_interface for job with appropriate experiment name
-            config={
-                "ops": {},
-                "resources": {
-                    "mlflow_interface": {
-                        "config": {
-                            "experiment_name": pipeline_name,
-                            "tracking_enabled": True,
-                        }
-                    }
-                },
-            },
-            **kwargs,
-        )
-    )
-    PUDL_PIPELINE_VALIDATION_ASSETS.extend(assets)
-    PUDL_PIPELINE_VALIDATION_RESOURCES.update(resources)
diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py
index 001c6ad..1e2d56b 100644
--- a/src/mozilla_sec_eia/models/sec10k/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/__init__.py
@@ -1 +1,58 @@
 """Implement models to extract data from SEC10k filings."""
+
+from dagster import (
+    Definitions,
+    load_assets_from_modules,
+    load_assets_from_package_module,
+)
+
+from mozilla_sec_eia.library import model_jobs
+from mozilla_sec_eia.library.mlflow import (
+    MlflowInterface,
+    mlflow_interface_resource,
+    mlflow_validation_io_managers,
+)
+
+from . import basic_10k, ex_21, extract
+from .utils.cloud import cloud_interface_resource
+
+basic_10k_assets = load_assets_from_modules([basic_10k])
+ex21_assets = load_assets_from_package_module(ex_21)
+shared_assets = load_assets_from_modules([extract])
+
+basic_10k_production_job = model_jobs.create_production_model_job(
+    "basic_10k_extraction",
+    basic_10k.production_assets,
+)
+
+basic_10k_validation_job = model_jobs.create_validation_model_job(
+    "basic_10k_extraction_validation",
+    basic_10k.validation_assets,
+)
+
+
+ex21_production_job = model_jobs.create_production_model_job(
+    "ex21_extraction",
+    ex_21.production_assets,
+)
+
+ex21_validation_job = model_jobs.create_validation_model_job(
+    "ex21_extraction_validation",
+    ex_21.validation_assets,
+)
+
+
+defs = Definitions(
+    assets=basic_10k_assets + ex21_assets + shared_assets,
+    jobs=[
+        basic_10k_production_job,
+        basic_10k_validation_job,
+        ex21_production_job,
+        ex21_validation_job,
+    ],
+    resources={
+        "cloud_interface": cloud_interface_resource,
+        "mlflow_interface": mlflow_interface_resource,
+    }
+    | mlflow_validation_io_managers,
+)
diff --git a/src/mozilla_sec_eia/models/sec10k/basic_10k.py b/src/mozilla_sec_eia/models/sec10k/basic_10k.py
index df57ac5..a4c0230 100644
--- a/src/mozilla_sec_eia/models/sec10k/basic_10k.py
+++ b/src/mozilla_sec_eia/models/sec10k/basic_10k.py
@@ -3,9 +3,16 @@
 import logging
 
 import pandas as pd
+from dagster import AssetIn, asset
 
-from .extract import Sec10kExtractor
-from .utils.cloud import Sec10K
+from mozilla_sec_eia.library import validation_helpers
+
+from .extract import (
+    Sec10kExtractor,
+    sec10k_extraction_asset_factory,
+    sec10k_filing_metadata,
+)
+from .utils.cloud import GCSArchive, Sec10K, cloud_interface_resource
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
@@ -99,3 +106,77 @@ def extract_filings(
                 ["filename", "filer_count", "block", "block_count", "key"]
             ),
         )
+
+
+@asset
+def basic_10k_validation_set() -> pd.DataFrame:
+    """Return dataframe containing basic 10k validation data."""
+    return validation_helpers.load_validation_data(
+        "basic_10k_labels.csv",
+        index_cols=["filename", "filer_count", "block", "block_count", "key"],
+    )
+
+
+basic_10k_extracted_validation_asset_name = "basic_10k_company_info_validation"
+
+
+@asset(
+    ins={
+        basic_10k_extracted_validation_asset_name: AssetIn(
+            basic_10k_extracted_validation_asset_name
+        ),
+        "basic_10k_validation_set": AssetIn(),
+    },
+    io_manager_key="mlflow_metrics_io_manager",
+)
+def basic_10k_extraction_validation_metrics(**kwargs):
+    """Compute basic 10k extraction validation metrics."""
+    computed = kwargs[basic_10k_extracted_validation_asset_name]
+    validation = kwargs["basic_10k_validation_set"]
+
+    return validation_helpers.pandas_compute_precision_recall(
+        computed, validation, value_col="value"
+    )
+
+
+@asset
+def basic_10k_validation_filing_metadata(
+    cloud_interface: GCSArchive,
+    basic_10k_validation_set: pd.DataFrame,
+) -> pd.DataFrame:
+    """Get sec 10k filing metadata from validation set."""
+    filing_metadata = cloud_interface.get_metadata()
+    return filing_metadata[
+        filing_metadata["filename"].isin(
+            basic_10k_validation_set.index.get_level_values("filename").unique()
+        )
+    ]
+
+
+basic_10k_extractor_resource = Basic10kExtractor(
+    cloud_interface=cloud_interface_resource
+)
+basic_10k_production_extraction = sec10k_extraction_asset_factory(
+    "basic_10k",
+    basic_10k_extractor_resource,
+    extraction_metadata_asset_name="basic_10k_extraction_metadata",
+    extracted_asset_name="basic_10k_company_info",
+)
+
+
+basic_10k_validation_extraction = sec10k_extraction_asset_factory(
+    "basic_10k_validation",
+    basic_10k_extractor_resource,
+    filing_metadata_asset_name="basic_10k_validation_filing_metadata",
+    extraction_metadata_asset_name="basic_10k_extraction_validation_metadata",
+    extracted_asset_name=basic_10k_extracted_validation_asset_name,
+)
+
+production_assets = [basic_10k_production_extraction, sec10k_filing_metadata]
+
+validation_assets = [
+    basic_10k_validation_extraction,
+    basic_10k_validation_set,
+    basic_10k_validation_filing_metadata,
+    basic_10k_extraction_validation_metrics,
+]
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
index 549c348..52b8a9d 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
@@ -1 +1,173 @@
 """Module for working with exhibit 21 data."""
+
+import pandas as pd
+from dagster import AssetIn, AssetOut, asset, multi_asset
+
+from mozilla_sec_eia.library import validation_helpers
+from mozilla_sec_eia.library.mlflow import mlflow_interface_resource
+
+from ..extract import sec10k_extraction_asset_factory, sec10k_filing_metadata
+from ..utils.cloud import GCSArchive, cloud_interface_resource, get_metadata_filename
+from ..utils.layoutlm import LayoutlmResource
+from .inference import Exhibit21Extractor, clean_extracted_df
+
+
+@asset
+def ex21_validation_set() -> pd.DataFrame:
+    """Return dataframe containing cleaned Ex. 21 validation data."""
+    return clean_ex21_validation_set(
+        validation_helpers.load_validation_data("ex21_labels.csv")
+    )
+
+
+@asset
+def ex21_validation_filing_metadata(
+    cloud_interface: GCSArchive,
+    ex21_validation_set: pd.DataFrame,
+) -> pd.DataFrame:
+    """Get sec 10k filing metadata from validation set."""
+    filing_metadata = cloud_interface.get_metadata()
+    return filing_metadata[
+        filing_metadata["filename"].isin(
+            ex21_validation_set.index.get_level_values("filename").unique()
+        )
+    ]
+
+
+ex21_extracted_validation_asset_name = "ex21_validation"
+
+
+@multi_asset(
+    ins={
+        "computed_df": AssetIn(ex21_extracted_validation_asset_name),
+        "validation_df": AssetIn("ex21_validation_set"),
+    },
+    outs={
+        "ex21_jaccard_per_table": AssetOut(
+            io_manager_key="mlflow_pandas_artifact_io_manager"
+        ),
+        "ex21_precision_recall_per_table": AssetOut(
+            io_manager_key="mlflow_pandas_artifact_io_manager"
+        ),
+        "ex21_incorrect_filenames": AssetOut(
+            io_manager_key="mlflow_pandas_artifact_io_manager"
+        ),
+        "ex21_extraction_metrics": AssetOut(io_manager_key="mlflow_metrics_io_manager"),
+    },
+)
+def ex21_validation_metrics(computed_df: pd.DataFrame, validation_df: pd.DataFrame):
+    """Compute validation metrics for Ex. 21 extraction."""
+    shared_cols = validation_df.columns.intersection(computed_df.columns)
+    validation_df = validation_df.astype(computed_df[shared_cols].dtypes)
+    n_equal = 0
+    validation_filenames = validation_df["id"].unique()
+    n_files = len(validation_filenames)
+    table_metrics_dict = {}
+    jaccard_dict = {}
+    incorrect_files = []
+    # iterate through each file and check each extracted table
+    for filename in validation_filenames:
+        extracted_table_df = computed_df[computed_df["id"] == filename].reset_index(
+            drop=True
+        )
+        validation_table_df = validation_df[
+            validation_df["id"] == filename
+        ].reset_index(drop=True)
+        # check if the tables are exactly equal
+        if extracted_table_df.equals(validation_table_df):
+            # TODO: strip llc and other company strings before comparison
+            n_equal += 1
+        else:
+            incorrect_files.append(filename)
+        # compute precision and recall for each column
+        table_metrics_dict[filename] = {}
+        jaccard_dict[filename] = {}
+        for col in ["subsidiary", "loc", "own_per"]:
+            table_prec_recall = validation_helpers.pandas_compute_precision_recall(
+                extracted_table_df, validation_table_df, value_col=col
+            )
+            table_metrics_dict[filename][f"{col}_precision"] = table_prec_recall[
+                "precision"
+            ]
+            table_metrics_dict[filename][f"{col}_recall"] = table_prec_recall["recall"]
+            # get the jaccard similarity between columns
+            jaccard_dict[filename][col] = validation_helpers.jaccard_similarity(
+                computed_df=extracted_table_df,
+                validation_df=validation_table_df,
+                value_col=col,
+            )
+
+    jaccard_df = pd.DataFrame.from_dict(jaccard_dict, orient="index").reset_index()
+    prec_recall_df = pd.DataFrame.from_dict(
+        table_metrics_dict, orient="index"
+    ).reset_index()
+
+    return (
+        jaccard_df,
+        prec_recall_df,
+        pd.DataFrame({"filename": incorrect_files}),
+        {
+            "table_accuracy": n_equal / n_files,
+            "avg_subsidiary_jaccard_sim": jaccard_df["subsidiary"].sum() / n_files,
+            "avg_location_jaccard_sim": jaccard_df["loc"].sum() / n_files,
+            "avg_own_per_jaccard_sim": jaccard_df["own_per"].sum() / n_files,
+            "avg_subsidiary_precision": prec_recall_df["subsidiary_precision"].sum()
+            / n_files,
+            "avg_location_precision": prec_recall_df["loc_precision"].sum() / n_files,
+            "avg_own_per_precision": prec_recall_df["own_per_precision"].sum()
+            / n_files,
+            "avg_subsidiary_recall": prec_recall_df["subsidiary_recall"].sum()
+            / n_files,
+            "avg_location_recall": prec_recall_df["loc_recall"].sum() / n_files,
+            "avg_own_per_recall": prec_recall_df["own_per_recall"].sum() / n_files,
+        },
+    )
+
+
+def clean_ex21_validation_set(validation_df: pd.DataFrame):
+    """Clean Ex. 21 validation data to match extracted format."""
+    validation_df = validation_df.rename(
+        columns={
+            "Filename": "id",
+            "Subsidiary": "subsidiary",
+            "Location of Incorporation": "loc",
+            "Ownership Percentage": "own_per",
+        }
+    )
+    validation_df["own_per"] = validation_df["own_per"].astype(str)
+    validation_df["filename"] = validation_df["id"].apply(get_metadata_filename)
+    validation_df = clean_extracted_df(validation_df)
+    return validation_df
+
+
+exhibit_21_extractor_resource = Exhibit21Extractor(
+    cloud_interface=cloud_interface_resource,
+    layoutlm=LayoutlmResource(mlflow_interface=mlflow_interface_resource),
+)
+ex21_production_extraction = sec10k_extraction_asset_factory(
+    "ex21",
+    exhibit_21_extractor_resource,
+    extraction_metadata_asset_name="ex21_extraction_metadata",
+    extracted_asset_name="ex21_company_info",
+)
+
+
+ex21_validation_extraction = sec10k_extraction_asset_factory(
+    "ex21_validation",
+    exhibit_21_extractor_resource,
+    filing_metadata_asset_name="ex21_validation_filing_metadata",
+    extraction_metadata_asset_name="ex21_extraction_validation_metadata",
+    extracted_asset_name=ex21_extracted_validation_asset_name,
+)
+
+production_assets = [
+    sec10k_filing_metadata,
+    ex21_production_extraction,
+]
+
+validation_assets = [
+    ex21_validation_set,
+    ex21_validation_filing_metadata,
+    ex21_validation_extraction,
+    ex21_validation_metrics,
+]
diff --git a/src/mozilla_sec_eia/models/sec10k/extract.py b/src/mozilla_sec_eia/models/sec10k/extract.py
index 4461b49..8904b53 100644
--- a/src/mozilla_sec_eia/models/sec10k/extract.py
+++ b/src/mozilla_sec_eia/models/sec10k/extract.py
@@ -1,7 +1,16 @@
 """Implement base class for an SEC10k extractor."""
 
 import pandas as pd
-from dagster import ConfigurableResource
+from dagster import (
+    AssetExecutionContext,
+    AssetIn,
+    AssetOut,
+    ConfigurableResource,
+    StaticPartitionsDefinition,
+    asset,
+    multi_asset,
+    with_resources,
+)
 
 from .utils.cloud import GCSArchive
 
@@ -19,3 +28,67 @@ def extract_filings(
         raise NotImplementedError(
             "extract_filings must be implemented by any subclass!"
         )
+
+
+# Create year_quarter partitions
+year_quarter_partitions = StaticPartitionsDefinition(
+    [f"{year}q{quarter}" for year in range(1994, 2024) for quarter in range(1, 5)]
+)
+
+
+@asset(partitions_def=year_quarter_partitions)
+def sec10k_filing_metadata(
+    context: AssetExecutionContext,
+    cloud_interface: GCSArchive,
+) -> pd.DataFrame:
+    """Return filing metadata for year_quarter partition."""
+    year_quarter = context.partition_key
+    df = cloud_interface.get_metadata(year_quarter=year_quarter)
+    return df
+
+
+def sec10k_extraction_asset_factory(
+    name: str,
+    sec10k_extractor: Sec10kExtractor,
+    partitions_def=year_quarter_partitions,
+    filing_metadata_asset_name: str = "sec10k_filing_metadata",
+    extraction_metadata_asset_name: str = "extraction_metadata",
+    extracted_asset_name: str = "extraction_metadata",
+):
+    """Create asset to extract data from sec10k data.
+
+    Args:
+        name: Name of extraction asset.
+        sec10k_extractor: Subclass of Sec10kExtractor used to extract data.
+        partitions_def: Partitions for asset. Defaults to year_quarter_partitions,
+            so unpartitioned (e.g. validation) assets must override it.
+        filing_metadata_asset_name: Name of input asset with metadata of filings to
+            extract.
+        extraction_metadata_asset_name: Name of output asset with run metadata.
+        extracted_asset_name: Name of output asset with extracted data; must differ
+            from extraction_metadata_asset_name (the defaults collide, so set it).
+    """
+
+    @multi_asset(
+        name=name,
+        outs={
+            extraction_metadata_asset_name: AssetOut(),
+            extracted_asset_name: AssetOut(),
+        },
+        ins={"sec10k_filing_metadata": AssetIn(filing_metadata_asset_name)},
+        partitions_def=partitions_def,
+        required_resource_keys={sec10k_extractor.name},
+    )
+    def extract_filings(
+        context: AssetExecutionContext, sec10k_filing_metadata: pd.DataFrame
+    ) -> tuple[pd.DataFrame, pd.DataFrame]:
+        """Run Sec10kExtractor on selected partition and return."""
+        extractor = context.resources.original_resource_dict[sec10k_extractor.name]
+        extraction_metadata, extracted = extractor.extract_filings(
+            sec10k_filing_metadata
+        )
+        return extraction_metadata, extracted
+
+    return with_resources([extract_filings], {sec10k_extractor.name: sec10k_extractor})[
+        0
+    ]
diff --git a/src/mozilla_sec_eia/models/sec10k/pipeline.py b/src/mozilla_sec_eia/models/sec10k/pipeline.py
deleted file mode 100644
index 5604d6a..0000000
--- a/src/mozilla_sec_eia/models/sec10k/pipeline.py
+++ /dev/null
@@ -1,360 +0,0 @@
-"""Implement top level extraction methods and tooling."""
-
-import logging
-
-import pandas as pd
-import pandera as pa
-from dagster import (
-    AssetExecutionContext,
-    AssetIn,
-    AssetOut,
-    StaticPartitionsDefinition,
-    asset,
-    multi_asset,
-    with_resources,
-)
-
-from mozilla_sec_eia.library import validation_helpers
-from mozilla_sec_eia.library.mlflow import (
-    mlflow_production_interface,
-    mlflow_train_test_interface,
-)
-from mozilla_sec_eia.library.pipeline import (
-    create_production_pipeline,
-    create_validation_pipeline,
-)
-
-from .basic_10k import Basic10kExtractor
-from .ex_21.inference import Exhibit21Extractor, clean_extracted_df
-from .extract import Sec10kExtractor
-from .utils.cloud import GCSArchive, cloud_interface_resource, get_metadata_filename
-from .utils.layoutlm import LayoutlmResource
-
-logger = logging.getLogger(f"catalystcoop.{__name__}")
-
-DATASETS = ["ex21", "basic_10k"]
-
-
-class ExtractionMetadataSchema(pa.DataFrameModel):
-    """Define the required schema for extraction metadata.
-
-    Extra columns are permitted, but these are required for computing extraction metrics.
-    """
-
-    filename: pa.typing.Index[str] = pa.Field(check_name=True)
-    success: bool = pa.Field(coerce=True)
-
-
-# Create year_quarter partitions
-partitions_def = StaticPartitionsDefinition(
-    [f"{year}q{quarter}" for year in range(1994, 2024) for quarter in range(1, 5)]
-)
-
-
-@asset(partitions_def=partitions_def)
-def sec10k_filing_metadata(
-    context: AssetExecutionContext,
-    cloud_interface: GCSArchive,
-) -> pd.DataFrame:
-    """Return filing metadata for year_quarter partition."""
-    year_quarter = context.partition_key
-    df = cloud_interface.get_metadata(year_quarter=year_quarter)
-    return df
-
-
-def sec10k_extraction_asset_factory(
-    name: str,
-    sec10k_extractor: Sec10kExtractor,
-    partitions_def=None,
-    filing_metadata_asset_name: str = "sec10k_filing_metadata",
-    extraction_metadata_asset_name: str = "extraction_metadata",
-    extracted_asset_name: str = "extraction_metadata",
-):
-    """Create asset to extract data from sec10k data.
-
-    Args:
-        name: Name of extraction asset.
-        sec10k_extractor: Subclass of Sec10kExtractor used to extract data.
-        partitions_def: Partitions for asset (production uses year_quarter parts,
-            validation is not partitioned.
-        filing_metadata_asset_name: Name of input asset with metadata of filings to
-            extract.
-        extraction_metadata_asset_name: Name of output asset containing metadata
-            from extraction run.
-        extracted_asset_name: Name of output asset containing extracted data.
-    """
-
-    @multi_asset(
-        name=name,
-        outs={
-            extraction_metadata_asset_name: AssetOut(),
-            extracted_asset_name: AssetOut(),
-        },
-        ins={"sec10k_filing_metadata": AssetIn(filing_metadata_asset_name)},
-        partitions_def=partitions_def,
-        required_resource_keys={sec10k_extractor.name},
-    )
-    def extract_filings(
-        context: AssetExecutionContext, sec10k_filing_metadata: pd.DataFrame
-    ) -> tuple[pd.DataFrame, pd.DataFrame]:
-        """Run Sec10kExtractor on selected partition and return."""
-        extractor = context.resources.original_resource_dict[sec10k_extractor.name]
-        extraction_metadata, extracted = extractor.extract_filings(
-            sec10k_filing_metadata
-        )
-        return extraction_metadata, extracted
-
-    return with_resources([extract_filings], {sec10k_extractor.name: sec10k_extractor})[
-        0
-    ]
-
-
-@asset
-def basic_10k_validation_set() -> pd.DataFrame:
-    """Return dataframe containing basic 10k validation data."""
-    return validation_helpers.load_validation_data(
-        "basic_10k_labels.csv",
-        index_cols=["filename", "filer_count", "block", "block_count", "key"],
-    )
-
-
-basic_10k_extracted_validation_asset_name = "basic_10k_company_info_validation"
-
-
-@asset(
-    ins={
-        basic_10k_extracted_validation_asset_name: AssetIn(
-            basic_10k_extracted_validation_asset_name
-        ),
-        "basic_10k_validation_set": AssetIn(),
-    },
-    io_manager_key="mlflow_metrics_io_manager",
-)
-def basic_10k_extraction_validation_metrics(**kwargs):
-    """Compute basic 10k extraction validation metrics."""
-    computed = kwargs[basic_10k_extracted_validation_asset_name]
-    validation = kwargs["basic_10k_validation_set"]
-
-    return validation_helpers.pandas_compute_precision_recall(
-        computed, validation, value_col="value"
-    )
-
-
-@asset
-def basic_10k_validation_filing_metadata(
-    cloud_interface: GCSArchive,
-    basic_10k_validation_set: pd.DataFrame,
-) -> pd.DataFrame:
-    """Get sec 10k filing metadata from validation set."""
-    filing_metadata = cloud_interface.get_metadata()
-    return filing_metadata[
-        filing_metadata["filename"].isin(
-            basic_10k_validation_set.index.get_level_values("filename").unique()
-        )
-    ]
-
-
-# Register basic 10k extraction pipeline
-create_production_pipeline(
-    "basic_10k_extraction",
-    [
-        sec10k_filing_metadata,
-        sec10k_extraction_asset_factory(
-            "basic_10k",
-            Basic10kExtractor(cloud_interface=cloud_interface_resource),
-            partitions_def=partitions_def,
-            extraction_metadata_asset_name="basic_10k_extraction_metadata",
-            extracted_asset_name="basic_10k_company_info",
-        ),
-    ],
-    resources={"cloud_interface": cloud_interface_resource},
-)
-
-
-# Register basic 10k extraction validation pipeline
-create_validation_pipeline(
-    "basic_10k_extraction",
-    [
-        basic_10k_validation_filing_metadata,
-        sec10k_extraction_asset_factory(
-            "basic_10k",
-            Basic10kExtractor(cloud_interface=cloud_interface_resource),
-            filing_metadata_asset_name="basic_10k_validation_filing_metadata",
-            extraction_metadata_asset_name="basic_10k_extraction_validation_metadata",
-            extracted_asset_name=basic_10k_extracted_validation_asset_name,
-        ),
-        basic_10k_validation_set,
-        basic_10k_extraction_validation_metrics,
-    ],
-    resources={"cloud_interface": cloud_interface_resource},
-)
-
-
-@asset
-def ex21_validation_set() -> pd.DataFrame:
-    """Return dataframe containing basic 10k validation data."""
-    return clean_ex21_validation_set(
-        validation_helpers.load_validation_data("ex21_labels.csv")
-    )
-
-
-@asset
-def ex21_validation_filing_metadata(
-    cloud_interface: GCSArchive,
-    ex21_validation_set: pd.DataFrame,
-) -> pd.DataFrame:
-    """Get sec 10k filing metadata from validation set."""
-    filing_metadata = cloud_interface.get_metadata()
-    return filing_metadata[
-        filing_metadata["filename"].isin(
-            ex21_validation_set.index.get_level_values("filename").unique()
-        )
-    ]
-
-
-ex21_extracted_validation_asset_name = "ex21_validation"
-
-
-@multi_asset(
-    ins={
-        "computed_df": AssetIn(ex21_extracted_validation_asset_name),
-        "validation_df": AssetIn("ex21_validation_set"),
-    },
-    outs={
-        "ex21_jaccard_per_table": AssetOut(
-            io_manager_key="mlflow_pandas_artifact_io_manager"
-        ),
-        "ex21_precision_recall_per_table": AssetOut(
-            io_manager_key="mlflow_pandas_artifact_io_manager"
-        ),
-        "ex21_incorrect_filenames": AssetOut(
-            io_manager_key="mlflow_pandas_artifact_io_manager"
-        ),
-        "ex21_extraction_metrics": AssetOut(io_manager_key="mlflow_metrics_io_manager"),
-    },
-)
-def ex21_validation_metrics(computed_df: pd.DataFrame, validation_df: pd.DataFrame):
-    """Compute validation metrics for Ex. 21 extraction."""
-    shared_cols = validation_df.columns.intersection(computed_df.columns)
-    validation_df = validation_df.astype(computed_df[shared_cols].dtypes)
-    n_equal = 0
-    validation_filenames = validation_df["id"].unique()
-    n_files = len(validation_filenames)
-    table_metrics_dict = {}
-    jaccard_dict = {}
-    incorrect_files = []
-    # iterate through each file and check each extracted table
-    for filename in validation_filenames:
-        extracted_table_df = computed_df[computed_df["id"] == filename].reset_index(
-            drop=True
-        )
-        validation_table_df = validation_df[
-            validation_df["id"] == filename
-        ].reset_index(drop=True)
-        # check if the tables are exactly equal
-        if extracted_table_df.equals(validation_table_df):
-            # TODO: strip llc and other company strings before comparison
-            n_equal += 1
-        else:
-            incorrect_files.append(filename)
-        # compute precision and recall for each column
-        table_metrics_dict[filename] = {}
-        jaccard_dict[filename] = {}
-        for col in ["subsidiary", "loc", "own_per"]:
-            table_prec_recall = validation_helpers.pandas_compute_precision_recall(
-                extracted_table_df, validation_table_df, value_col=col
-            )
-            table_metrics_dict[filename][f"{col}_precision"] = table_prec_recall[
-                "precision"
-            ]
-            table_metrics_dict[filename][f"{col}_recall"] = table_prec_recall["recall"]
-            # get the jaccard similarity between columns
-            jaccard_dict[filename][col] = validation_helpers.jaccard_similarity(
-                computed_df=extracted_table_df,
-                validation_df=validation_table_df,
-                value_col=col,
-            )
-
-    jaccard_df = pd.DataFrame.from_dict(jaccard_dict, orient="index").reset_index()
-    prec_recall_df = pd.DataFrame.from_dict(
-        table_metrics_dict, orient="index"
-    ).reset_index()
-
-    return (
-        jaccard_df,
-        prec_recall_df,
-        pd.DataFrame({"filename": incorrect_files}),
-        {
-            "table_accuracy": n_equal / n_files,
-            "avg_subsidiary_jaccard_sim": jaccard_df["subsidiary"].sum() / n_files,
-            "avg_location_jaccard_sim": jaccard_df["loc"].sum() / n_files,
-            "avg_own_per_jaccard_sim": jaccard_df["own_per"].sum() / n_files,
-            "avg_subsidiary_precision": prec_recall_df["subsidiary_precision"].sum()
-            / n_files,
-            "avg_location_precision": prec_recall_df["loc_precision"].sum() / n_files,
-            "avg_own_per_precision": prec_recall_df["own_per_precision"].sum()
-            / n_files,
-            "avg_subsidiary_recall": prec_recall_df["subsidiary_recall"].sum()
-            / n_files,
-            "avg_location_recall": prec_recall_df["loc_recall"].sum() / n_files,
-            "avg_own_per_recall": prec_recall_df["own_per_recall"].sum() / n_files,
-        },
-    )
-
-
-def clean_ex21_validation_set(validation_df: pd.DataFrame):
-    """Clean Ex. 21 validation data to match extracted format."""
-    validation_df = validation_df.rename(
-        columns={
-            "Filename": "id",
-            "Subsidiary": "subsidiary",
-            "Location of Incorporation": "loc",
-            "Ownership Percentage": "own_per",
-        }
-    )
-    validation_df["own_per"] = validation_df["own_per"].astype(str)
-    validation_df["filename"] = validation_df["id"].apply(get_metadata_filename)
-    validation_df = clean_extracted_df(validation_df)
-    return validation_df
-
-
-# Register ex21 extraction pipeline
-create_production_pipeline(
-    "ex21_extraction",
-    [
-        sec10k_filing_metadata,
-        sec10k_extraction_asset_factory(
-            "ex21",
-            Exhibit21Extractor(
-                cloud_interface=cloud_interface_resource,
-                layoutlm=LayoutlmResource(mlflow_interface=mlflow_production_interface),
-            ),
-            partitions_def=partitions_def,
-            extraction_metadata_asset_name="ex21_extraction_metadata",
-            extracted_asset_name="ex21_company_info",
-        ),
-    ],
-    resources={"cloud_interface": cloud_interface_resource},
-)
-
-
-# Register ex21 extraction validation pipeline
-create_validation_pipeline(
-    "ex21_extraction",
-    [
-        ex21_validation_filing_metadata,
-        sec10k_extraction_asset_factory(
-            "ex21",
-            Exhibit21Extractor(
-                cloud_interface=cloud_interface_resource,
-                layoutlm=LayoutlmResource(mlflow_interface=mlflow_train_test_interface),
-            ),
-            filing_metadata_asset_name="ex21_validation_filing_metadata",
-            extraction_metadata_asset_name="ex21_extraction_validation_metadata",
-            extracted_asset_name=ex21_extracted_validation_asset_name,
-        ),
-        ex21_validation_set,
-        ex21_validation_metrics,
-    ],
-    resources={"cloud_interface": cloud_interface_resource},
-)
diff --git a/src/mozilla_sec_eia/pudl_pipelines.py b/src/mozilla_sec_eia/pudl_pipelines.py
deleted file mode 100644
index 57b3808..0000000
--- a/src/mozilla_sec_eia/pudl_pipelines.py
+++ /dev/null
@@ -1,26 +0,0 @@
-"""Define production pipelines for running PUDL models."""
-
-import logging
-
-import coloredlogs
-from dagster import Definitions
-
-from mozilla_sec_eia.library.mlflow import mlflow_production_interface
-from mozilla_sec_eia.library.pipeline import (
-    PUDL_PIPELINE_PRODUCTION_ASSETS,
-    PUDL_PIPELINE_PRODUCTION_JOBS,
-    PUDL_PIPELINE_PRODUCTION_RESOURCES,
-)
-
-logger = logging.getLogger("catalystcoop")
-log_format = "%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s"
-coloredlogs.install(fmt=log_format, logger=logger)
-
-production_io_resources = {} | PUDL_PIPELINE_PRODUCTION_RESOURCES
-
-production_pipelines = Definitions(
-    assets=PUDL_PIPELINE_PRODUCTION_ASSETS,
-    jobs=PUDL_PIPELINE_PRODUCTION_JOBS,
-    resources=production_io_resources
-    | {"mlflow_interface": mlflow_production_interface},
-)
diff --git a/src/mozilla_sec_eia/pudl_validation_pipelines.py b/src/mozilla_sec_eia/pudl_validation_pipelines.py
deleted file mode 100644
index 63e9049..0000000
--- a/src/mozilla_sec_eia/pudl_validation_pipelines.py
+++ /dev/null
@@ -1,34 +0,0 @@
-"""Define jobs to test/validate PUDL models."""
-
-import logging
-
-import coloredlogs
-from dagster import Definitions
-
-from mozilla_sec_eia.library.mlflow import (
-    get_mlflow_io_manager,
-    mlflow_train_test_interface,
-)
-from mozilla_sec_eia.library.pipeline import (
-    PUDL_PIPELINE_VALIDATION_ASSETS,
-    PUDL_PIPELINE_VALIDATION_JOBS,
-    PUDL_PIPELINE_VALIDATION_RESOURCES,
-)
-
-logger = logging.getLogger("catalystcoop")
-log_format = "%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s"
-coloredlogs.install(fmt=log_format, logger=logger)
-
-validation_io_resources = {
-    key: get_mlflow_io_manager(
-        key, mlflow_interface=mlflow_train_test_interface, pandas_file_type="csv"
-    )
-    for key in ["mlflow_pandas_artifact_io_manager", "mlflow_metrics_io_manager"]
-} | PUDL_PIPELINE_VALIDATION_RESOURCES
-
-validation_pipelines = Definitions(
-    assets=PUDL_PIPELINE_VALIDATION_ASSETS,
-    jobs=PUDL_PIPELINE_VALIDATION_JOBS,
-    resources=validation_io_resources
-    | {"mlflow_interface": mlflow_train_test_interface},
-)
diff --git a/tests/unit/models/sec10k/extract_test.py b/tests/unit/models/sec10k/extract_test.py
index c1cdac0..a413767 100644
--- a/tests/unit/models/sec10k/extract_test.py
+++ b/tests/unit/models/sec10k/extract_test.py
@@ -5,8 +5,8 @@
 
 import pandas as pd
 from dagster import asset, build_asset_context, materialize
-from mozilla_sec_eia.models.sec10k.extract import Sec10kExtractor
-from mozilla_sec_eia.models.sec10k.pipeline import (
+from mozilla_sec_eia.models.sec10k.extract import (
+    Sec10kExtractor,
     sec10k_extraction_asset_factory,
     sec10k_filing_metadata,
 )
@@ -71,6 +71,7 @@ def fake_filing_metadata_asset():
         filing_metadata_asset_name="fake_filing_metadata_asset",
         extracted_asset_name="test_sec10k_extraction",
         extraction_metadata_asset_name="test_sec10k_extraction_metadata",
+        partitions_def=None,
     )
 
     # Run assets and review results

From e99ee1ae6063cb976191a2937e90ac10f61d64ca Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Tue, 3 Sep 2024 11:04:58 -0400
Subject: [PATCH 021/161] Add top-level workspace file

---
 workspace.yaml | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 workspace.yaml

diff --git a/workspace.yaml b/workspace.yaml
new file mode 100644
index 0000000..144aada
--- /dev/null
+++ b/workspace.yaml
@@ -0,0 +1,2 @@
+load_from:
+  - python_module: mozilla_sec_eia.models.sec10k

From 559c0e6c813733f8a57cf9242fc3ac1e22680bf2 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Tue, 3 Sep 2024 12:55:14 -0400
Subject: [PATCH 022/161] Restructure docs

---
 README.rst                                   | 155 ++++++++-----------
 src/mozilla_sec_eia/models/sec10k/README.rst |  99 ++++++++++++
 2 files changed, 160 insertions(+), 94 deletions(-)
 create mode 100644 src/mozilla_sec_eia/models/sec10k/README.rst

diff --git a/README.rst b/README.rst
index 6516ee4..d036e6c 100644
--- a/README.rst
+++ b/README.rst
@@ -1,4 +1,4 @@
-mozilla-sec-eia: Developing a linkage between SEC and EIA
+pudl-models: ML models developed for PUDL
 =======================================================================================
 
 .. readme-intro
@@ -27,100 +27,67 @@ mozilla-sec-eia: Developing a linkage between SEC and EIA
    :target: https://github.com/psf/black>
    :alt: Any color you want, so long as it's black.
 
-This repo contains exploratory development for an SEC-EIA linkage.
-
-Usage
+About
 -----
-
-CLI
-^^^
-The CLI uses a sub-command structure, so new commands and workflows can easily be
-added during development. It's usage is as following:
-
-``mozilla_dev {COMMAND} {OPTIONS}``
-
-The available commands are ``validate_archive``, which validates that all filings on
-the GCS archive align with those described in the metadata DB, ``finetune_ex21``,
-which will finetune the exhibit 21 extractor and log the model using mlflow, and
-``rename_filings``, which will rename labeled filings on GCS.
-
-Experiment/Model Tracking
-^^^^^^^^^^^^^^^^^^^^^^^^^
-We've setup a remote tracking server using `mlflow <https://mlflow.org/docs/latest/tracking.html>`_
-to manage tracking, caching, and versioning models developed as a part of this project.
-To interact with the server through the UI, go `here <https://mlflow-ned2up6sra-uc.a.run.app>`_
-and login using the username and password stored in gcloud secret manager.
-There is currently a finetuned layoutlm model for extracting exhibit 21 data stored
-on the server. This model can be accessed using the method
-``src/mozilla_sec_eia/utils/cloud.py:load_model``. This will return a dictionary
-containing ``model`` and ``tokenizer`` fields.
-
-Helper Tools
-^^^^^^^^^^^^
-Utility functions for accessing and working with 10k filings as well as their exhibit
-21 attachments can be found in 'src/mozilla_sec_eia/utils/cloud.py'. The base class is
-the ``GCSArchive`` which provides an interface to archived filings on GCS. To
-instantiate this class, the following environment variables need to be set, or defined
-in a ``.env`` file:
-
-``GCS_BUCKET_NAME``
-``GCS_METADATA_DB_INSTANCE_CONNECTION``
-``GCS_IAM_USER``
-``GCS_METADATA_DB_NAME``
-``GCS_PROJECT``
-``MLFLOW_TRACKING_URI``
-
-This code sample shows how to use the class to fetch filings from the archive:
-
-.. code-block:: python
-
-   from mozilla_sec_eia.utils.cloud import GCSArchive
-   archive = GCSArchive()
-
-   # Get metadata from postgres instance
-   metadata_df = archive.get_metadata()
-
-   # Do some filtering to get filings of interest
-   filings = metadata_df.loc[...  # Get rows from original df
-
-   # This will download and cache filings locally for later use
-   # Successive calls to get_filings will not re-download filings which are already cahced
-   downloaded_filings = archive.get_filings(filings)
-
-   # Get exhibit 21's and extract subsidiary data
-   for filing in downloaded_filings:
-           cool_extraction_model(filing.get_ex_21().as_image())
-
-Labeling
---------
-We are using `Label Studio <https://labelstud.io/>`_ to create training data
-for fine-tuning the Ex. 21 extraction model. The very preliminary workflow
-for labeling data is as follows:
-
-* For each filing that you want to label, follow notebook 7 to create the
-  inputs for Label Studio. This notebook first creates a PDF of the filing.
-  Then, it extracts the bounding boxes around each word and create a "task"
-  JSON and image for each Ex. 21 table that will be used in Label Studio.
-* Upload these JSONs and images to the same bucket in GCS (the "unlabeled"
-  bucket by default).
-* `Install Label Studio <https://labelstud.io/guide/install>`_
-* Start Label Studio locally and create a project.
-* Under Settings, set the template/config for the project with the config
-  found in ``labeling-configs/labeling-config.xml``. This should create the
-  correct entity labels and UI setup.
-* Connect GCS to Label Studio by following `these directions
-  <https://labelstud.io/guide/storage#Google-Cloud-Storage>`_
-* Specific Label Studio settings: Filter files for only JSONs
-  (these are your tasks). Leave "Treat every bucket object as a source file"
-  disabled. Add the service account authentication JSON for your bucket.
-* Additionally add a Target Storage bucket (the "labeled" bucket by
-  default).
-* Import data and label Ex. 21 tables.
-* Sync with target storage.
-* Update the ``labeled_data_tracking.csv`` with the new filings you've
-  labeled.
-* Run the ``rename_labeled_filings.py`` script to update labeled file
-  names in the GCS bucket with their SEC filename.
+The `PUDL <https://github.com/catalyst-cooperative/pudl>`__ project makes US energy data free and open
+for all. For more information, see the PUDL repo and `website <https://catalyst.coop/pudl/>`__.
+
+This repo implements machine learning models which support PUDL. The types of
+modelling performed here include record linkage between datasets, and extracting
+structured data from unstructured documents. The outputs of these models then feed
+into PUDL tables, and are distributed in the PUDL data warehouse.
+
+Project Structure
+-----------------
+This repo is split into two main sections, with shared tooling being implemented in
+``src/mozilla_sec_eia/library`` and actual models implemented in
+``src/mozilla_sec_eia/models``.
+
+Models
+^^^^^^
+Each model is contained in its own Dagster
+`code location <https://docs.dagster.io/concepts/code-locations>`__. This keeps models
+isolated from each other, allowing finetuned dependency management, and provides useful
+organization in the Dagster UI. To add a new model, you must create a new python module
+in the ``src/mozilla_sec_eia/models/`` directory. This module should define a single
+Dagster ``Definitions`` object which can be imported from the top-level of the module.
+For an example of how to structure a code location, see
+``src/mozilla_sec_eia/models/sec10k/``. After creating a new model,
+it must be added to
+`workspace.yaml <https://docs.dagster.io/concepts/code-locations/workspace-files>`__.
+
+There are three types of dagster `jobs <https://docs.dagster.io/concepts/assets/asset-jobs>`__
+expected in a model code location:
+
+* **Production Jobs**: Production jobs define a pipeline to execute a model and produce
+  outputs which typically feed into PUDL.
+* **Validation Jobs**: Validation jobs are used to test/validate models. They will be
+  run in a single process with an
+  `mlflow <https://mlflow.org/docs/latest/tracking.html>`__ run backing
+  them to allow logging results to a tracking server.
+* **Training Jobs**: Training jobs are meant to train models and log results with
+  mlflow for use in production jobs.
+
+There are helper functions in ``src/mozilla_sec_eia/library/model_jobs.py`` for
+constructing each of these jobs. These functions help to ensure each job will
+use the appropriate executor and supply the job with necessary resources.
+
+Library
+^^^^^^^
+There's generic shared tooling for ``pudl-models`` defined in
+``src/mozilla_sec_eia/library/``. This includes the helper functions for
+constructing dagster jobs discussed above, as well as useful methods for computing
+validation metrics, and an interface to our mlflow tracking server integrated with
+Dagster.
+
+MlFlow
+""""""
+We use a remote `mlflow tracking <https://mlflow.org/docs/latest/tracking.html>`__ server to aid in the
+development and management of ``pudl-models``. In the ``mlflow`` module, there are
+several dagster resources and IO-managers that can be used in any model to provide a simple,
+seamless interface to the server.
+
+.. TODO: Add mlflow resource/io-manager examples
 
 
 About Catalyst Cooperative
diff --git a/src/mozilla_sec_eia/models/sec10k/README.rst b/src/mozilla_sec_eia/models/sec10k/README.rst
new file mode 100644
index 0000000..ffecf28
--- /dev/null
+++ b/src/mozilla_sec_eia/models/sec10k/README.rst
@@ -0,0 +1,99 @@
+sec10k: Extracting company ownership data from sec10k documents
+=======================================================================================
+
+This repo contains exploratory development for an SEC-EIA linkage.
+
+Usage
+-----
+
+Helper Tools
+^^^^^^^^^^^^
+Utility functions for accessing and working with 10k filings as well as their exhibit
+21 attachments can be found in ``src/mozilla_sec_eia/models/sec10k/utils/cloud.py``.
+The base class is ``GCSArchive``, which provides an interface to archived filings
+on GCS. To instantiate this class, the following environment variables need to be set,
+or defined in a ``.env`` file:
+
+``GCS_BUCKET_NAME``
+``GCS_METADATA_DB_INSTANCE_CONNECTION``
+``GCS_IAM_USER``
+``GCS_METADATA_DB_NAME``
+``GCS_PROJECT``
+``MLFLOW_TRACKING_URI``
+
+This code sample shows how to use the class to fetch filings from the archive:
+
+.. code-block:: python
+
+   from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive
+   archive = GCSArchive()
+
+   # Get metadata from postgres instance
+   metadata_df = archive.get_metadata()
+
+   # Do some filtering to get filings of interest
+   filings = metadata_df.loc[...]  # Get rows from original df
+
+   # This will download and cache filings locally for later use
+   # Successive calls to get_filings will not re-download filings which are already cached
+   downloaded_filings = archive.get_filings(filings)
+
+   # Get exhibit 21's and extract subsidiary data
+   for filing in downloaded_filings:
+           cool_extraction_model(filing.get_ex_21().as_image())
+
+Labeling
+--------
+We are using `Label Studio <https://labelstud.io/>`_ to create training data
+for fine-tuning the Ex. 21 extraction model. The very preliminary workflow
+for labeling data is as follows:
+
+* For each filing that you want to label, follow notebook 7 to create the
+  inputs for Label Studio. This notebook first creates a PDF of the filing.
+  Then, it extracts the bounding boxes around each word and creates a "task"
+  JSON and image for each Ex. 21 table that will be used in Label Studio.
+* Upload these JSONs and images to the same bucket in GCS (the "unlabeled"
+  bucket by default).
+* `Install Label Studio <https://labelstud.io/guide/install>`_
+* Start Label Studio locally and create a project.
+* Under Settings, set the template/config for the project with the config
+  found in ``labeling-configs/labeling-config.xml``. This should create the
+  correct entity labels and UI setup.
+* Connect GCS to Label Studio by following `these directions
+  <https://labelstud.io/guide/storage#Google-Cloud-Storage>`_
+* Specific Label Studio settings: Filter files for only JSONs
+  (these are your tasks). Leave "Treat every bucket object as a source file"
+  disabled. Add the service account authentication JSON for your bucket.
+* Additionally add a Target Storage bucket (the "labeled" bucket by
+  default).
+* Import data and label Ex. 21 tables.
+* Sync with target storage.
+* Update the ``labeled_data_tracking.csv`` with the new filings you've
+  labeled.
+* Run the ``rename_labeled_filings.py`` script to update labeled file
+  names in the GCS bucket with their SEC filename.
+
+
+About Catalyst Cooperative
+---------------------------------------------------------------------------------------
+`Catalyst Cooperative <https://catalyst.coop>`__ is a small group of data
+wranglers and policy wonks organized as a worker-owned cooperative consultancy.
+Our goal is a more just, livable, and sustainable world. We integrate public
+data and perform custom analyses to inform public policy (`Hire us!
+<https://catalyst.coop/hire-catalyst>`__). Our focus is primarily on mitigating
+climate change and improving electric utility regulation in the United States.
+
+Contact Us
+^^^^^^^^^^
+* For general support, questions, or other conversations around the project
+  that might be of interest to others, check out the
+  `GitHub Discussions <https://github.com/catalyst-cooperative/pudl/discussions>`__
+* If you'd like to get occasional updates about our projects
+  `sign up for our email list <https://catalyst.coop/updates/>`__.
+* Want to schedule a time to chat with us one-on-one? Join us for
+  `Office Hours <https://calend.ly/catalyst-cooperative/pudl-office-hours>`__
+* Follow us on Twitter: `@CatalystCoop <https://twitter.com/CatalystCoop>`__
+* More info on our website: https://catalyst.coop
+* For private communication about the project or to hire us to provide customized data
+  extraction and analysis, you can email the maintainers:
+  `pudl@catalyst.coop <mailto:pudl@catalyst.coop>`__

From 93d02f3aac6aebb3486221eb4280a143e186864d Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Tue, 3 Sep 2024 16:14:55 -0400
Subject: [PATCH 023/161] Add train model job

---
 .../library/mlflow/__init__.py                |  2 +-
 src/mozilla_sec_eia/library/model_jobs.py     | 10 ++++++++++
 src/mozilla_sec_eia/models/sec10k/__init__.py | 20 +++++++++++++++----
 .../models/sec10k/basic_10k.py                |  2 ++
 .../models/sec10k/ex_21/__init__.py           |  3 ++-
 .../models/sec10k/ex_21/train_extractor.py    | 14 ++++++-------
 src/mozilla_sec_eia/models/sec10k/extract.py  | 13 ++++++------
 tests/unit/models/sec10k/extract_test.py      |  8 ++++++--
 8 files changed, 50 insertions(+), 22 deletions(-)

diff --git a/src/mozilla_sec_eia/library/mlflow/__init__.py b/src/mozilla_sec_eia/library/mlflow/__init__.py
index 5987f75..f07997a 100644
--- a/src/mozilla_sec_eia/library/mlflow/__init__.py
+++ b/src/mozilla_sec_eia/library/mlflow/__init__.py
@@ -33,7 +33,7 @@ def get_mlflow_io_manager(
 
 
 mlflow_interface_resource = MlflowInterface.configure_at_launch()
-mlflow_validation_io_managers = {
+mlflow_train_test_io_managers = {
     "mlflow_metrics_io_manager": get_mlflow_io_manager(
         "mlflow_metrics_io_manager",
         mlflow_interface=mlflow_interface_resource,
diff --git a/src/mozilla_sec_eia/library/model_jobs.py b/src/mozilla_sec_eia/library/model_jobs.py
index 8bdfc5c..45602b4 100644
--- a/src/mozilla_sec_eia/library/model_jobs.py
+++ b/src/mozilla_sec_eia/library/model_jobs.py
@@ -86,3 +86,13 @@ def create_validation_model_job(
         },
         **kwargs,
     )
+
+
+def create_training_job(
+    job_name: str,
+    assets: list[AssetsDefinition],
+    **kwargs,
+):
+    """Construct a dagster job meant to train a model and log with mlflow."""
+    # For now training job config is the same as validation
+    return create_validation_model_job(job_name, assets, **kwargs)
diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py
index 1e2d56b..7abbb4e 100644
--- a/src/mozilla_sec_eia/models/sec10k/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/__init__.py
@@ -10,14 +10,16 @@
 from mozilla_sec_eia.library.mlflow import (
     MlflowInterface,
     mlflow_interface_resource,
-    mlflow_validation_io_managers,
+    mlflow_train_test_io_managers,
 )
 
 from . import basic_10k, ex_21, extract
 from .utils.cloud import cloud_interface_resource
+from .utils.layoutlm import LayoutlmIOManager
 
 basic_10k_assets = load_assets_from_modules([basic_10k])
 ex21_assets = load_assets_from_package_module(ex_21)
+layoutlm_assets = load_assets_from_modules([ex_21.train_extractor])
 shared_assets = load_assets_from_modules([extract])
 
 basic_10k_production_job = model_jobs.create_production_model_job(
@@ -25,7 +27,7 @@
     basic_10k.production_assets,
 )
 
-basic_10k_validation_job = model_jobs.create_production_model_job(
+basic_10k_validation_job = model_jobs.create_validation_model_job(
     "basic_10k_extraction_validation",
     basic_10k.validation_assets,
 )
@@ -41,18 +43,28 @@
     ex_21.validation_assets,
 )
 
+layoutlm_finetune_job = model_jobs.create_training_job(
+    "layoutlm_finetune",
+    layoutlm_assets,
+)
+
 
 defs = Definitions(
-    assets=basic_10k_assets + ex21_assets + shared_assets,
+    assets=basic_10k_assets + ex21_assets + shared_assets + layoutlm_assets,
     jobs=[
         basic_10k_production_job,
         basic_10k_validation_job,
         ex21_production_job,
         ex21_validation_job,
+        layoutlm_finetune_job,
     ],
     resources={
         "cloud_interface": cloud_interface_resource,
         "mlflow_interface": mlflow_interface_resource,
+        "layoutlm_io_manager": LayoutlmIOManager(
+            mlflow_interface=mlflow_interface_resource
+        ),
     }
-    | mlflow_validation_io_managers,
+    | mlflow_train_test_io_managers
+    | extract.SEC10k_EXTRACTOR_RESOURCES,
 )
diff --git a/src/mozilla_sec_eia/models/sec10k/basic_10k.py b/src/mozilla_sec_eia/models/sec10k/basic_10k.py
index a4c0230..4c943c6 100644
--- a/src/mozilla_sec_eia/models/sec10k/basic_10k.py
+++ b/src/mozilla_sec_eia/models/sec10k/basic_10k.py
@@ -170,6 +170,8 @@ def basic_10k_validation_filing_metadata(
     filing_metadata_asset_name="basic_10k_validation_filing_metadata",
     extraction_metadata_asset_name="basic_10k_extraction_validation_metadata",
     extracted_asset_name=basic_10k_extracted_validation_asset_name,
+    partitions_def=None,
+    io_manager_key="mlflow_pandas_artifact_io_manager",
 )
 
 production_assets = [basic_10k_production_extraction, sec10k_filing_metadata]
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
index 52b8a9d..11817c4 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
@@ -148,7 +148,7 @@ def clean_ex21_validation_set(validation_df: pd.DataFrame):
     "ex21",
     exhibit_21_extractor_resource,
     extraction_metadata_asset_name="ex21_extraction_metadata",
-    extracted_asset_name="ex21_company_info",
+    extracted_asset_name="ex21_company_ownership_info",
 )
 
 
@@ -158,6 +158,7 @@ def clean_ex21_validation_set(validation_df: pd.DataFrame):
     filing_metadata_asset_name="ex21_validation_filing_metadata",
     extraction_metadata_asset_name="ex21_extraction_validation_metadata",
     extracted_asset_name=ex21_extracted_validation_asset_name,
+    partitions_def=None,
 )
 
 production_assets = [
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/train_extractor.py b/src/mozilla_sec_eia/models/sec10k/ex_21/train_extractor.py
index 00c80f3..cb37619 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/train_extractor.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/train_extractor.py
@@ -7,9 +7,8 @@
 
 from pathlib import Path
 
-import mlflow
 import numpy as np
-from dagster import Config
+from dagster import Config, asset
 from datasets import (
     Array2D,
     Array3D,
@@ -137,15 +136,15 @@ def load_test_train_set(
 class FineTuneConfig(Config):
     """Configuration to supply to `train_model`."""
 
-    labeled_json_path: str
+    labeled_json_path: str = "sec10k_filings/labeled_jsons/"
     gcs_training_data_dir: str = "labeled"
     output_dir: str = "layoutlm_trainer"
     test_size: float = 0.2
 
 
-def train_model(
+@asset(io_manager_key="layoutlm_io_manager")
+def layoutlm(
     config: FineTuneConfig,
-    layoutlm_mlflow_interface,
 ):
     """Train LayoutLM model with labeled data."""
     # Prepare model
@@ -189,6 +188,5 @@ def train_model(
     )
 
     # Train inside mlflow run. Mlflow will automatically handle logging training metrcis
-    with mlflow.start_run():
-        trainer.train()
-        # log_model(trainer)
+    trainer.train()
+    return trainer
diff --git a/src/mozilla_sec_eia/models/sec10k/extract.py b/src/mozilla_sec_eia/models/sec10k/extract.py
index 8904b53..9db88bb 100644
--- a/src/mozilla_sec_eia/models/sec10k/extract.py
+++ b/src/mozilla_sec_eia/models/sec10k/extract.py
@@ -9,11 +9,12 @@
     StaticPartitionsDefinition,
     asset,
     multi_asset,
-    with_resources,
 )
 
 from .utils.cloud import GCSArchive
 
+SEC10k_EXTRACTOR_RESOURCES = {}
+
 
 class Sec10kExtractor(ConfigurableResource):
     """Base class for extracting SEC 10k data."""
@@ -54,6 +55,7 @@ def sec10k_extraction_asset_factory(
     filing_metadata_asset_name: str = "sec10k_filing_metadata",
     extraction_metadata_asset_name: str = "extraction_metadata",
     extracted_asset_name: str = "extraction_metadata",
+    io_manager_key: str | None = None,
 ):
     """Create asset to extract data from sec10k data.
 
@@ -72,8 +74,8 @@ def sec10k_extraction_asset_factory(
     @multi_asset(
         name=name,
         outs={
-            extraction_metadata_asset_name: AssetOut(),
-            extracted_asset_name: AssetOut(),
+            extraction_metadata_asset_name: AssetOut(io_manager_key=io_manager_key),
+            extracted_asset_name: AssetOut(io_manager_key=io_manager_key),
         },
         ins={"sec10k_filing_metadata": AssetIn(filing_metadata_asset_name)},
         partitions_def=partitions_def,
@@ -89,6 +91,5 @@ def extract_filings(
         )
         return extraction_metadata, extracted
 
-    return with_resources([extract_filings], {sec10k_extractor.name: sec10k_extractor})[
-        0
-    ]
+    SEC10k_EXTRACTOR_RESOURCES[sec10k_extractor.name] = sec10k_extractor
+    return extract_filings
diff --git a/tests/unit/models/sec10k/extract_test.py b/tests/unit/models/sec10k/extract_test.py
index a413767..82efac4 100644
--- a/tests/unit/models/sec10k/extract_test.py
+++ b/tests/unit/models/sec10k/extract_test.py
@@ -65,9 +65,10 @@ def fake_filing_metadata_asset():
         return fake_filing_metadata
 
     # Create fake extraction asset with configured inputs
+    test_extractor = TestSec10kExtractor(cloud_interface=FakeArchive())
     extraction_multi_asset = sec10k_extraction_asset_factory(
         name="test_sec10k_extraction",
-        sec10k_extractor=TestSec10kExtractor(cloud_interface=FakeArchive()),
+        sec10k_extractor=test_extractor,
         filing_metadata_asset_name="fake_filing_metadata_asset",
         extracted_asset_name="test_sec10k_extraction",
         extraction_metadata_asset_name="test_sec10k_extraction_metadata",
@@ -75,7 +76,10 @@ def fake_filing_metadata_asset():
     )
 
     # Run assets and review results
-    result = materialize([fake_filing_metadata_asset, extraction_multi_asset])
+    result = materialize(
+        [fake_filing_metadata_asset, extraction_multi_asset],
+        resources={test_extractor.name: test_extractor},
+    )
     pd.testing.assert_frame_equal(
         result.asset_value("test_sec10k_extraction_metadata"), fake_extraction_metadata
     )

From 5190bf99d8b634cfe164d9d51db71b54cbe8a3d3 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Tue, 3 Sep 2024 16:24:29 -0400
Subject: [PATCH 024/161] Log mlflow artifacts as parquet until csv is fixed

---
 src/mozilla_sec_eia/library/mlflow/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/mozilla_sec_eia/library/mlflow/__init__.py b/src/mozilla_sec_eia/library/mlflow/__init__.py
index f07997a..a7a65d6 100644
--- a/src/mozilla_sec_eia/library/mlflow/__init__.py
+++ b/src/mozilla_sec_eia/library/mlflow/__init__.py
@@ -41,6 +41,5 @@ def get_mlflow_io_manager(
     "mlflow_pandas_artifact_io_manager": get_mlflow_io_manager(
         "mlflow_pandas_artifact_io_manager",
         mlflow_interface=mlflow_interface_resource,
-        pandas_file_type="csv",
     ),
 }

From ca9599e70e0ef9cc95e40125e06e17ee5c2ab619 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Tue, 3 Sep 2024 21:07:52 -0400
Subject: [PATCH 025/161] Fix ex21 extraction

---
 src/mozilla_sec_eia/models/sec10k/ex_21/inference.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
index 56648eb..235b9c3 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
@@ -251,8 +251,14 @@ def extract_filings(
                 that is the filename of the extracted Ex. 21. Dataframe contains columns id,
                 subsidiary, loc, own_per.
         """
+        filings_with_ex21 = filing_metadata[
+            ~filing_metadata["exhibit_21_version"].isna()
+        ]
+        self.cloud_interface.get_filings(
+            filings_with_ex21, cache_directory=self._pdf_dir, cache_pdf=True
+        )
         dataset = create_inference_dataset(
-            pdfs_dir=self._pdf_dir,
+            pdfs_dir=Path(self._pdf_dir),
             labeled_json_dir=self._labeled_json_dir,
             has_labels=self.has_labels,
         )
@@ -286,7 +292,7 @@ def extract_filings(
         all_output_df = clean_extracted_df(all_output_df)
         all_output_df = all_output_df[["id", "subsidiary", "loc", "own_per"]]
         all_output_df = all_output_df.reset_index(drop=True)
-        return logits, predictions, all_output_df, extraction_metadata
+        return extraction_metadata, all_output_df
 
 
 class LayoutLMInferencePipeline(Pipeline):

From 7e7a50319fc1365ba1ea4f948ea536d4f11d968d Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 4 Sep 2024 09:28:05 -0400
Subject: [PATCH 026/161] Add development section to docs

---
 README.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/README.rst b/README.rst
index d036e6c..c85c013 100644
--- a/README.rst
+++ b/README.rst
@@ -89,6 +89,12 @@ seamless interface to the server.
 
 .. TODO: Add mlflow resource/io-manager examples
 
+Development
+-----------
+To launch the dagster UI to load all ``pudl-models``, run the command ``dagster dev``
+in the top-level of this repo. This will load the file ``workspace.yaml``, which points
+to each model. You can also work on a single model in isolation by running the command:
+``dagster dev -m mozilla_sec_eia.models.{your_cool_model}``.
 
 About Catalyst Cooperative
 ---------------------------------------------------------------------------------------

From 61f48c36bdc92fd1def79e18fa667b33ed093108 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 4 Sep 2024 11:56:57 -0400
Subject: [PATCH 027/161] Fix integration tests

---
 .../library/mlflow/mlflow_resource.py         |  19 ++-
 .../models/sec10k/ex_21/__init__.py           |   4 +-
 .../integration/models/sec10k/extract_test.py | 141 +++---------------
 3 files changed, 29 insertions(+), 135 deletions(-)

diff --git a/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py b/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py
index 35dbf26..1060b9b 100644
--- a/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py
+++ b/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py
@@ -77,17 +77,20 @@ def mlflow_run_id(self) -> str | None:
 
     def _get_tracking_password(self, version_id: str = "latest"):
         """Get tracking server password from gcloud secrets."""
-        # Create the Secret Manager client.
-        client = secretmanager.SecretManagerServiceClient()
+        # Password not required for local use
+        if "sqlite" not in self.tracking_uri:
+            # Create the Secret Manager client.
+            client = secretmanager.SecretManagerServiceClient()
 
-        # Build the resource name of the secret version.
-        name = f"projects/{self.project}/secrets/mlflow_admin_password/versions/{version_id}"
+            # Build the resource name of the secret version.
+            name = f"projects/{self.project}/secrets/mlflow_admin_password/versions/{version_id}"
 
-        # Access the secret version.
-        response = client.access_secret_version(name=name)
+            # Access the secret version.
+            response = client.access_secret_version(name=name)
 
-        # Return the decoded payload.
-        return response.payload.data.decode("UTF-8")
+            # Return the decoded payload.
+            return response.payload.data.decode("UTF-8")
+        return ""
 
     def _configure_mlflow(self):
         """Do runtime configuration of mlflow."""
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
index 11817c4..39cd2d6 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
@@ -28,9 +28,7 @@ def ex21_validation_filing_metadata(
     """Get sec 10k filing metadata from validation set."""
     filing_metadata = cloud_interface.get_metadata()
     return filing_metadata[
-        filing_metadata["filename"].isin(
-            ex21_validation_set.index.get_level_values("filename").unique()
-        )
+        filing_metadata["filename"].isin(ex21_validation_set["filename"].unique())
     ]
 
 
diff --git a/tests/integration/models/sec10k/extract_test.py b/tests/integration/models/sec10k/extract_test.py
index 5da0bb5..dce8577 100644
--- a/tests/integration/models/sec10k/extract_test.py
+++ b/tests/integration/models/sec10k/extract_test.py
@@ -1,143 +1,36 @@
 """Validate basic 10k and exhibit 21 extraction."""
 
 import logging
-import unittest
 
 import dotenv
-import numpy as np
-import pandas as pd
-import pytest
-from dagster import EnvVar, build_asset_context
-from mozilla_sec_eia.ex_21.inference import (
-    clean_extracted_df,
-    create_inference_dataset,
-    perform_inference,
-)
-from mozilla_sec_eia.extract import (
-    _get_most_recent_run,
-    basic_10k_validate,
-    ex21_validate,
-)
-from mozilla_sec_eia.utils.cloud import GCSArchive, MlflowInterface
-from mozilla_sec_eia.utils.layoutlm import load_model
-from pandas.testing import assert_frame_equal
+from mozilla_sec_eia.models import sec10k
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
 
-def test_basic_10k_extraction():
-    """Run full 10k extraction on validation set and verify desired metrics are met."""
+def test_basic_10k_validation(
+    test_tracker_factory,
+    get_most_recent_mlflow_run_factory,
+):
+    """Test basic_10k_validation_job."""
     dotenv.load_dotenv()
-    experiment_name = "basic_10k_validate_test"
-    cloud_interface = GCSArchive(
-        filings_bucket_name=EnvVar("GCS_FILINGS_BUCKET_NAME"),
-        labels_bucket_name=EnvVar("GCS_LABELS_BUCKET_NAME"),
-        metadata_db_instance_connection=EnvVar("GCS_METADATA_DB_INSTANCE_CONNECTION"),
-        user=EnvVar("GCS_IAM_USER"),
-        metadata_db_name=EnvVar("GCS_METADATA_DB_NAME"),
-        project=EnvVar("GCS_PROJECT"),
-    )
+    sec10k.defs.get_job_def("basic_10k_extraction_validation").execute_in_process()
 
-    with build_asset_context(
-        resources={
-            "basic_10k_extract_validate_mlflow": MlflowInterface(
-                experiment_name=experiment_name,
-                continue_run=False,
-                tracking_uri="sqlite:///:memory:",
-                cloud_interface=cloud_interface,
-            ),
-            "cloud_interface": cloud_interface,
-        }
-    ) as context:
-        basic_10k_validate(context)
-    run = _get_most_recent_run(experiment_name)
+    run = get_most_recent_mlflow_run_factory("basic_10k_extraction_validation")
 
     assert run.data.metrics["precision"] == 1
     assert run.data.metrics["recall"] == 1
 
 
-@pytest.mark.xfail
-def test_ex21_validation(test_mlflow_init_func):
-    """Run full Ex. 21 extraction on validation set and verify metrics are met."""
+def test_ex21_validation(
+    test_tracker_factory,
+    get_most_recent_mlflow_run_factory,
+):
+    """Test ex21_validation_job."""
     dotenv.load_dotenv()
-    experiment_name = "ex21_validate_test"
-    cloud_interface = GCSArchive(
-        filings_bucket_name=EnvVar("GCS_FILINGS_BUCKET_NAME"),
-        labels_bucket_name=EnvVar("GCS_LABELS_BUCKET_NAME"),
-        metadata_db_instance_connection=EnvVar("GCS_METADATA_DB_INSTANCE_CONNECTION"),
-        user=EnvVar("GCS_IAM_USER"),
-        metadata_db_name=EnvVar("GCS_METADATA_DB_NAME"),
-        project=EnvVar("GCS_PROJECT"),
-    )
-
-    with build_asset_context(
-        resources={
-            "ex21_validate_test": MlflowInterface(
-                experiment_name=experiment_name,
-                continue_run=False,
-                tracking_uri="sqlite:///:memory:",
-                cloud_interface=cloud_interface,
-            ),
-            "cloud_interface": cloud_interface,
-        }
-    ) as context:
-        ex21_validate(context)
-    run = _get_most_recent_run(experiment_name)
-    # TODO: add in actual metric checks once validation is ready
-    assert run.data.metrics["ratio_extracted"] == 1
-
-
-@pytest.fixture
-def model_checkpoint():
-    """Load model from tracking server and return."""
-    return load_model()
+    sec10k.defs.get_job_def("ex21_extraction_validation").execute_in_process()
 
+    run = get_most_recent_mlflow_run_factory("ex21_extraction_validation")
 
-def test_model_loading(model_checkpoint):
-    """Test loading a fine-tuned LayoutLM model from MLFlow."""
-    assert "model" in model_checkpoint
-    assert "tokenizer" in model_checkpoint
-
-
-def test_dataset_creation(test_dir):
-    pdf_dir = test_dir / "data/test_pdfs"
-    dataset = create_inference_dataset(pdfs_dir=pdf_dir)
-    assert dataset.shape == (2, 4)
-
-
-def test_ex21_inference_and_table_extraction(
-    test_dir, test_mlflow_init_func, model_checkpoint
-):
-    """Test performing inference and extracting an Ex. 21 table."""
-    model = model_checkpoint["model"]
-    processor = model_checkpoint["tokenizer"]
-    pdf_dir = test_dir / "data" / "test_pdfs"
-    extraction_metadata = pd.DataFrame(
-        {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)}
-    ).set_index("filename")
-    with unittest.mock.patch("mozilla_sec_eia.extract.initialize_mlflow"):
-        test_mlflow_init_func()
-        logit_list, pred_list, output_df, extraction_metadata = perform_inference(
-            pdfs_dir=pdf_dir,
-            model=model,
-            processor=processor,
-            extraction_metadata=extraction_metadata,
-            device="cpu",
-        )
-    # we don't normally want to sort by id and subsidiary
-    # but sort here for the sake of just testing whether dataframe
-    # row values are the same without worrying about order
-    output_df = output_df.sort_values(by=["id", "subsidiary"]).reset_index(drop=True)
-    # TODO: uncomment with new model checkpoint and 7th label included
-    # assert logit_list[0].shape == (1, 512, len(LABELS))
-    expected_out_path = test_dir / "data" / "inference_and_extraction_expected_out.csv"
-    expected_out_df = pd.read_csv(
-        expected_out_path,
-        dtype={"id": str, "subsidiary": str, "loc": str, "own_per": np.float64},
-    )
-    expected_out_df["own_per"] = expected_out_df["own_per"].astype(str)
-    expected_out_df = clean_extracted_df(expected_out_df)
-    expected_out_df = expected_out_df.sort_values(by=["id", "subsidiary"]).reset_index(
-        drop=True
-    )
-    assert_frame_equal(expected_out_df, output_df, check_like=True)
+    assert run.data.metrics["avg_subsidiary_jaccard_sim"] > 0.85
+    assert run.data.metrics["avg_location_jaccard_sim"] > 0.9

From 0fd8ffc0c336a381eb8776d63d5f524acdf425f4 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 4 Sep 2024 12:15:51 -0400
Subject: [PATCH 028/161] Don't run ruff on notebooks

---
 tox.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tox.ini b/tox.ini
index 4b516d1..7c1fe9f 100644
--- a/tox.ini
+++ b/tox.ini
@@ -33,7 +33,7 @@ skip_install = false
 extras =
     test
 commands =
-    ruff check ./
+    ruff check ./src/
 
 [testenv:pre_commit]
 description = Run git pre-commit hooks not covered by the other linters.

From 97d558721dda2c6b8291aa583683d98b0e0d903e Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 4 Sep 2024 12:29:03 -0400
Subject: [PATCH 029/161] xfail ex21 integration test

---
 tests/integration/models/sec10k/extract_test.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/integration/models/sec10k/extract_test.py b/tests/integration/models/sec10k/extract_test.py
index dce8577..21be89b 100644
--- a/tests/integration/models/sec10k/extract_test.py
+++ b/tests/integration/models/sec10k/extract_test.py
@@ -3,6 +3,7 @@
 import logging
 
 import dotenv
+import pytest
 from mozilla_sec_eia.models import sec10k
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
@@ -22,6 +23,7 @@ def test_basic_10k_validation(
     assert run.data.metrics["recall"] == 1
 
 
+@pytest.mark.xfail
 def test_ex21_validation(
     test_tracker_factory,
     get_most_recent_mlflow_run_factory,

From ace268bcdf185eed620bfc8fe2d76bbeaf126287 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 4 Sep 2024 23:01:29 -0400
Subject: [PATCH 030/161] Add parquet upath io-manager

---
 .../library/generic_io_managers.py            | 21 +++++++++++++++++++
 src/mozilla_sec_eia/models/sec10k/__init__.py |  5 +++++
 .../models/sec10k/basic_10k.py                |  1 +
 .../models/sec10k/ex_21/__init__.py           |  1 +
 4 files changed, 28 insertions(+)
 create mode 100644 src/mozilla_sec_eia/library/generic_io_managers.py

diff --git a/src/mozilla_sec_eia/library/generic_io_managers.py b/src/mozilla_sec_eia/library/generic_io_managers.py
new file mode 100644
index 0000000..e85aa68
--- /dev/null
+++ b/src/mozilla_sec_eia/library/generic_io_managers.py
@@ -0,0 +1,21 @@
+"""Implement useful generic io-managers."""
+
+import pandas as pd
+from dagster import InputContext, OutputContext, UPathIOManager
+from upath import UPath
+
+
+class PandasParquetIOManager(UPathIOManager):
+    """Read and write pandas dataframes as parquet files on local or remote filesystem."""
+
+    extension: str = ".parquet"
+
+    def dump_to_path(self, context: OutputContext, obj: pd.DataFrame, path: UPath):
+        """Write ``obj`` to ``path`` as a parquet file."""
+        with path.open("wb") as file:
+            obj.to_parquet(file)
+
+    def load_from_path(self, context: InputContext, path: UPath) -> pd.DataFrame:
+        """Read the parquet file at ``path`` and return it as a dataframe."""
+        with path.open("rb") as file:
+            return pd.read_parquet(file)
diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py
index 7abbb4e..a04adc0 100644
--- a/src/mozilla_sec_eia/models/sec10k/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/__init__.py
@@ -5,8 +5,10 @@
     load_assets_from_modules,
     load_assets_from_package_module,
 )
+from upath import UPath
 
 from mozilla_sec_eia.library import model_jobs
+from mozilla_sec_eia.library.generic_io_managers import PandasParquetIOManager
 from mozilla_sec_eia.library.mlflow import (
     MlflowInterface,
     mlflow_interface_resource,
@@ -64,6 +66,9 @@
         "layoutlm_io_manager": LayoutlmIOManager(
             mlflow_interface=mlflow_interface_resource
         ),
+        "pandas_parquet_io_manager": PandasParquetIOManager(
+            base_path=UPath("gs://sec10k-outputs")
+        ),
     }
     | mlflow_train_test_io_managers
     | extract.SEC10k_EXTRACTOR_RESOURCES,
diff --git a/src/mozilla_sec_eia/models/sec10k/basic_10k.py b/src/mozilla_sec_eia/models/sec10k/basic_10k.py
index 4c943c6..5768279 100644
--- a/src/mozilla_sec_eia/models/sec10k/basic_10k.py
+++ b/src/mozilla_sec_eia/models/sec10k/basic_10k.py
@@ -161,6 +161,7 @@ def basic_10k_validation_filing_metadata(
     basic_10k_extractor_resource,
     extraction_metadata_asset_name="basic_10k_extraction_metadata",
     extracted_asset_name="basic_10k_company_info",
+    io_manager_key="pandas_parquet_io_manager",
 )
 
 
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
index 39cd2d6..f27b48a 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
@@ -147,6 +147,7 @@ def clean_ex21_validation_set(validation_df: pd.DataFrame):
     exhibit_21_extractor_resource,
     extraction_metadata_asset_name="ex21_extraction_metadata",
     extracted_asset_name="ex21_company_ownership_info",
+    io_manager_key="pandas_parquet_io_manager",
 )
 
 

From fb1feeb9926c03fad5cb4d3336e5b5669e1d8b95 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 4 Sep 2024 23:02:33 -0400
Subject: [PATCH 031/161] Remove nb-output clear

---
 .pre-commit-config.yaml | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 54d4a43..543d414 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -64,15 +64,6 @@ repos:
   ########################################################################################
   - repo: local
     hooks:
-      # clear outputs from Jupyter notebooks
-      - id: nb-output-clear
-        name: nb-output-clear
-        stages: [commit]
-        language: system
-        verbose: false
-        pass_filenames: false
-        always_run: true
-        entry: find notebooks \( -name \*.ipynb -not -name \*checkpoint.ipynb \) -type f -exec jupyter nbconvert --clear-output {} \;
       # Run the unit tests
       - id: unit-tests
         name: unit-tests

From 294ec721cec05aee4d8e81fca7f1e186d045b340 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Thu, 5 Sep 2024 12:50:52 -0400
Subject: [PATCH 032/161] Test docker deployment

---
 dagster.yaml                         | 48 ++++++++++++++++++++++++++++
 docker/deployment/Dockerfile         | 10 ++++++
 docker/deployment/docker-compose.yml | 34 ++++++++++++++++++++
 3 files changed, 92 insertions(+)
 create mode 100644 dagster.yaml
 create mode 100644 docker/deployment/Dockerfile
 create mode 100644 docker/deployment/docker-compose.yml

diff --git a/dagster.yaml b/dagster.yaml
new file mode 100644
index 0000000..92abbe6
--- /dev/null
+++ b/dagster.yaml
@@ -0,0 +1,48 @@
+run_coordinator:
+  module: dagster.core.run_coordinator
+  class: QueuedRunCoordinator
+  config:
+    tag_concurrency_limits:
+      - key: "dagster/backfill"
+        limit: 4
+run_storage:
+  module: dagster_postgres.run_storage
+  class: PostgresRunStorage
+  config:
+    postgres_db:
+      hostname: dagster_postgresql
+      username:
+        env: DAGSTER_POSTGRES_USER
+      password:
+        env: DAGSTER_POSTGRES_PASSWORD
+      db_name:
+        env: DAGSTER_POSTGRES_DB
+      port: 5432
+
+schedule_storage:
+  module: dagster_postgres.schedule_storage
+  class: PostgresScheduleStorage
+  config:
+    postgres_db:
+      hostname: docker_example_postgresql
+      username:
+        env: DAGSTER_POSTGRES_USER
+      password:
+        env: DAGSTER_POSTGRES_PASSWORD
+      db_name:
+        env: DAGSTER_POSTGRES_DB
+      port: 5432
+
+event_log_storage:
+  module: dagster_postgres.event_log
+  class: PostgresEventLogStorage
+  config:
+    postgres_db:
+      hostname: docker_example_postgresql
+      username:
+        env: DAGSTER_POSTGRES_USER
+      password:
+        env: DAGSTER_POSTGRES_PASSWORD
+      db_name:
+        env: DAGSTER_POSTGRES_DB
+      port: 5432
diff --git a/docker/deployment/Dockerfile b/docker/deployment/Dockerfile
new file mode 100644
index 0000000..facc384
--- /dev/null
+++ b/docker/deployment/Dockerfile
@@ -0,0 +1,10 @@
+FROM continuumio/miniconda3:24.7.1-0
+
+WORKDIR /opt/dagster/app
+
+# Build the conda env; `conda activate` does not work in a non-interactive RUN shell
+COPY . .
+RUN conda env create -f environment.yml
+
+EXPOSE 3000
+ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "mozilla-sec-eia", "dagster", "dev"]
diff --git a/docker/deployment/docker-compose.yml b/docker/deployment/docker-compose.yml
new file mode 100644
index 0000000..abed8bc
--- /dev/null
+++ b/docker/deployment/docker-compose.yml
@@ -0,0 +1,34 @@
+services:
+  # This service runs the postgres DB used by dagster for run storage, schedule storage,
+  # and event log storage.
+  dagster_postgresql:
+    image: postgres:16
+    environment:
+      POSTGRES_USER: "postgres_user"
+      POSTGRES_PASSWORD: "postgres_password"
+      POSTGRES_DB: "postgres_db"
+    networks:
+      - dagster
+
+  dagster_pudl_models:
+    build:
+      context: ../../
+      dockerfile: ./docker/deployment/Dockerfile
+    restart: always
+    environment:
+      DAGSTER_POSTGRES_USER: "postgres_user"
+      DAGSTER_POSTGRES_PASSWORD: "postgres_password"
+      DAGSTER_POSTGRES_DB: "postgres_db"
+      GCS_FILINGS_BUCKET_NAME: "2de2b9f52c99a240-bucket-sec-10ks"
+      GCS_LABELS_BUCKET_NAME: "labeled-ex21-filings"
+      GCS_METADATA_DB_INSTANCE_CONNECTION: "catalyst-cooperative-mozilla:us-central1:pg-mozilla"
+      GCS_METADATA_DB_NAME: "postgres"
+      GCS_IAM_USER: "mozilla-dev-sa@catalyst-cooperative-mozilla.iam.gserviceaccount.com"
+      MLFLOW_TRACKING_URI: "https://mlflow-ned2up6sra-uc.a.run.app"
+      GCS_PROJECT: "catalyst-cooperative-mozilla"
+    networks:
+      - dagster
+
+networks:
+  dagster:
+    driver: bridge

From 4de51b34e2652d0b063143d04e9074b9b33c4303 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Thu, 5 Sep 2024 17:53:06 -0400
Subject: [PATCH 033/161] Chunk ex 21 extraction

---
 .../models/sec10k/ex_21/inference.py          | 87 +++++++++++--------
 .../models/sec10k/utils/pdf.py                |  4 +-
 2 files changed, 53 insertions(+), 38 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
index 235b9c3..4c68638 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
@@ -1,6 +1,7 @@
 """Module for formatting inputs and performing inference with a fine-tuned LayoutLM model."""
 
 import logging
+import math
 import os
 import tempfile
 from contextlib import contextmanager
@@ -195,6 +196,7 @@ class Exhibit21Extractor(Sec10kExtractor):
     device: str = "cpu"
     has_labels: bool = False
     dataset_ind: list | None = None
+    filing_chunk_size: int = 8
     _pdf_dir: Path = PrivateAttr()
     _labeled_json_dir: Path | None = PrivateAttr(default=None)
 
@@ -254,45 +256,56 @@ def extract_filings(
         filings_with_ex21 = filing_metadata[
             ~filing_metadata["exhibit_21_version"].isna()
         ]
-        self.cloud_interface.get_filings(
-            filings_with_ex21, cache_directory=self._pdf_dir, cache_pdf=True
-        )
-        dataset = create_inference_dataset(
-            pdfs_dir=Path(self._pdf_dir),
-            labeled_json_dir=self._labeled_json_dir,
-            has_labels=self.has_labels,
-        )
-        if self.dataset_ind:
-            dataset = dataset.select(self.dataset_ind)
-
-        # TODO: figure out device argument
-        model, processor = self.layoutlm.get_model_components()
-        pipe = pipeline(
-            "token-classification",
-            model=model,
-            tokenizer=processor,
-            pipeline_class=LayoutLMInferencePipeline,
-            device=self.device,
+
+        filing_chunks = np.array_split(
+            filings_with_ex21,
+            math.ceil(len(filings_with_ex21) / self.filing_chunk_size),
         )
 
-        logits = []
-        predictions = []
-        all_output_df = pd.DataFrame(columns=["id", "subsidiary", "loc", "own_per"])
-        extraction_metadata = pd.DataFrame(
-            {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)}
-        ).set_index("filename")
-        for logit, pred, output_df in pipe(_get_data(dataset)):
-            logits.append(logit)
-            predictions.append(pred)
-            if not output_df.empty:
-                filename = get_metadata_filename(output_df["id"].iloc[0])
-                extraction_metadata.loc[filename, ["success"]] = True
-            all_output_df = pd.concat([all_output_df, output_df])
-        all_output_df.columns.name = None
-        all_output_df = clean_extracted_df(all_output_df)
-        all_output_df = all_output_df[["id", "subsidiary", "loc", "own_per"]]
-        all_output_df = all_output_df.reset_index(drop=True)
-        return extraction_metadata, all_output_df
+        all_outputs_dfs = []
+        extraction_metadata_dfs = []
+        for filings in filing_chunks:
+            self.cloud_interface.get_filings(
+                filings, cache_directory=self._pdf_dir, cache_pdf=True
+            )
+            dataset = create_inference_dataset(
+                pdfs_dir=Path(self._pdf_dir),
+                labeled_json_dir=self._labeled_json_dir,
+                has_labels=self.has_labels,
+            )
+            if self.dataset_ind:
+                dataset = dataset.select(self.dataset_ind)
+
+            # TODO: figure out device argument
+            model, processor = self.layoutlm.get_model_components()
+            pipe = pipeline(
+                "token-classification",
+                model=model,
+                tokenizer=processor,
+                pipeline_class=LayoutLMInferencePipeline,
+                device=self.device,
+            )
+
+            logits = []
+            predictions = []
+            all_output_df = pd.DataFrame(columns=["id", "subsidiary", "loc", "own_per"])
+            extraction_metadata = pd.DataFrame(
+                {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)}
+            ).set_index("filename")
+            for logit, pred, output_df in pipe(_get_data(dataset)):
+                logits.append(logit)
+                predictions.append(pred)
+                if not output_df.empty:
+                    filename = get_metadata_filename(output_df["id"].iloc[0])
+                    extraction_metadata.loc[filename, ["success"]] = True
+                all_output_df = pd.concat([all_output_df, output_df])
+            all_output_df.columns.name = None
+            all_output_df = clean_extracted_df(all_output_df)
+            all_output_df = all_output_df[["id", "subsidiary", "loc", "own_per"]]
+            all_output_df = all_output_df.reset_index(drop=True)
+            all_outputs_dfs.append(all_output_df)
+            extraction_metadata_dfs.append(extraction_metadata)
+        return pd.concat(extraction_metadata_dfs), pd.concat(all_outputs_dfs)
 
 
 class LayoutLMInferencePipeline(Pipeline):
diff --git a/src/mozilla_sec_eia/models/sec10k/utils/pdf.py b/src/mozilla_sec_eia/models/sec10k/utils/pdf.py
index a8c6411..df9be07 100644
--- a/src/mozilla_sec_eia/models/sec10k/utils/pdf.py
+++ b/src/mozilla_sec_eia/models/sec10k/utils/pdf.py
@@ -122,7 +122,9 @@ def combine_doc_pages(doc):
         combined_height += pg_txt_height
 
     output_pdf = fitz.open()
-    combined_page = output_pdf.new_page(width=combined_width, height=combined_height)
+    combined_page = output_pdf.new_page(
+        width=float(combined_width), height=float(combined_height)
+    )
 
     for i in range(len(doc)):
         if i in blank_page_nums:

From 214e28f02f6417cb282cf64594f2e46aaf74dade Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Thu, 5 Sep 2024 20:06:55 -0400
Subject: [PATCH 034/161] Fix asign copy

---
 dagster.yaml                                  | 41 -------------------
 .../models/sec10k/ex_21/inference.py          |  4 +-
 2 files changed, 3 insertions(+), 42 deletions(-)

diff --git a/dagster.yaml b/dagster.yaml
index 92abbe6..eba0c5f 100644
--- a/dagster.yaml
+++ b/dagster.yaml
@@ -5,44 +5,3 @@ run_coordinator:
     tag_concurrency_limits:
       - key: "dagster/backfill"
         limit: 4
-run_storage:
-  module: dagster_postgres.run_storage
-  class: PostgresRunStorage
-  config:
-    postgres_db:
-      hostname: dagster_postgresql
-      username:
-        env: DAGSTER_POSTGRES_USER
-      password:
-        env: DAGSTER_POSTGRES_PASSWORD
-      db_name:
-        env: DAGSTER_POSTGRES_DB
-      port: 5432
-
-schedule_storage:
-  module: dagster_postgres.schedule_storage
-  class: PostgresScheduleStorage
-  config:
-    postgres_db:
-      hostname: docker_example_postgresql
-      username:
-        env: DAGSTER_POSTGRES_USER
-      password:
-        env: DAGSTER_POSTGRES_PASSWORD
-      db_name:
-        env: DAGSTER_POSTGRES_DB
-      port: 5432
-
-event_log_storage:
-  module: dagster_postgres.event_log
-  class: PostgresEventLogStorage
-  config:
-    postgres_db:
-      hostname: docker_example_postgresql
-      username:
-        env: DAGSTER_POSTGRES_USER
-      password:
-        env: DAGSTER_POSTGRES_PASSWORD
-      db_name:
-        env: DAGSTER_POSTGRES_DB
-      port: 5432
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
index 4c68638..76116c3 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
@@ -426,7 +426,9 @@ def extract_table(self, all_outputs):
         first_in_group_df = df[
             (df["pred"].ne(df["pred"].shift())) & (df["pred"] != "other")
         ]
-        first_in_group_df["iob_pred"] = "B" + first_in_group_df["iob_pred"].str[1:]
+        first_in_group_df.loc[:, "iob_pred"] = (
+            "B" + first_in_group_df.loc[:, "iob_pred"].str[1:]
+        )
         df.update(first_in_group_df)
         # filter for just words that were labeled with non "other" entities
         entities_df = df.sort_values(by=["top_left_y", "top_left_x"])

From c5736e04f402d9be04398150706dfee8159090a1 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Fri, 6 Sep 2024 12:49:57 -0400
Subject: [PATCH 035/161] Add job for testing ex21 resource usage

---
 src/mozilla_sec_eia/models/sec10k/__init__.py |  5 +++++
 .../models/sec10k/ex_21/__init__.py           | 22 ++++++++++++++++++-
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py
index a04adc0..17c3758 100644
--- a/src/mozilla_sec_eia/models/sec10k/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/__init__.py
@@ -45,6 +45,10 @@
     ex_21.validation_assets,
 )
 
+ex21_test_job = model_jobs.create_validation_model_job(
+    "ex21_test", [ex_21.test_extraction_metrics]
+)
+
 layoutlm_finetune_job = model_jobs.create_training_job(
     "layoutlm_finetune",
     layoutlm_assets,
@@ -58,6 +62,7 @@
         basic_10k_validation_job,
         ex21_production_job,
         ex21_validation_job,
+        ex21_test_job,
         layoutlm_finetune_job,
     ],
     resources={
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
index f27b48a..4b6b3cf 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
@@ -1,10 +1,11 @@
 """Module for working with exhibit 21 data."""
 
+import mlflow
 import pandas as pd
 from dagster import AssetIn, AssetOut, asset, multi_asset
 
 from mozilla_sec_eia.library import validation_helpers
-from mozilla_sec_eia.library.mlflow import mlflow_interface_resource
+from mozilla_sec_eia.library.mlflow import MlflowInterface, mlflow_interface_resource
 
 from ..extract import sec10k_extraction_asset_factory, sec10k_filing_metadata
 from ..utils.cloud import GCSArchive, cloud_interface_resource, get_metadata_filename
@@ -138,6 +139,25 @@ def clean_ex21_validation_set(validation_df: pd.DataFrame):
     return validation_df
 
 
+@asset
+def test_extraction_metrics(
+    cloud_interface: GCSArchive,
+    exhibit21_extractor: Exhibit21Extractor,
+    mlflow_interface: MlflowInterface,
+):
+    """Run extraction with various numbers of filings to view resource usage."""
+    filings = cloud_interface.get_metadata()
+    for num_filings in [8, 16, 32, 64, 128]:
+        with mlflow.start_run(
+            run_name=f"extract_{num_filings}_filings",
+            nested=True,
+            parent_run_id=mlflow_interface.mlflow_run_id,
+            experiment_id=MlflowInterface.get_or_create_experiment("ex21_test"),
+        ):
+            mlflow.log_param("num_filings", num_filings)
+            exhibit21_extractor.extract_filings(filings.sample(num_filings))
+
+
 exhibit_21_extractor_resource = Exhibit21Extractor(
     cloud_interface=cloud_interface_resource,
     layoutlm=LayoutlmResource(mlflow_interface=mlflow_interface_resource),

From ec396334830301c45cbbb36618a1f484780e7493 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Fri, 6 Sep 2024 12:51:06 -0400
Subject: [PATCH 036/161] Remove test docker files

---
 docker/deployment/Dockerfile         | 10 --------
 docker/deployment/docker-compose.yml | 34 ----------------------------
 2 files changed, 44 deletions(-)
 delete mode 100644 docker/deployment/Dockerfile
 delete mode 100644 docker/deployment/docker-compose.yml

diff --git a/docker/deployment/Dockerfile b/docker/deployment/Dockerfile
deleted file mode 100644
index facc384..0000000
--- a/docker/deployment/Dockerfile
+++ /dev/null
@@ -1,10 +0,0 @@
-FROM continuumio/miniconda3:24.7.1-0
-
-WORKDIR /opt/dagster/app
-
-# Build environment
-COPY . .
-RUN conda env create -f environment.yml && conda activate mozilla-sec-eia
-
-EXPOSE 3000
-ENTRYPOINT ["dagster", "dev"]
diff --git a/docker/deployment/docker-compose.yml b/docker/deployment/docker-compose.yml
deleted file mode 100644
index abed8bc..0000000
--- a/docker/deployment/docker-compose.yml
+++ /dev/null
@@ -1,34 +0,0 @@
-services:
-  # This service runs the postgres DB used by dagster for run storage, schedule storage,
-  # and event log storage.
-  dagster_postgresql:
-    image: postgres:16
-    environment:
-      POSTGRES_USER: "postgres_user"
-      POSTGRES_PASSWORD: "postgres_password"
-      POSTGRES_DB: "postgres_db"
-    networks:
-      - dagster
-
-  dagster_pudl_models:
-    build:
-      context: ../../
-      dockerfile: ./docker/deployment/Dockerfile
-    restart: always
-    environment:
-      DAGSTER_POSTGRES_USER: "postgres_user"
-      DAGSTER_POSTGRES_PASSWORD: "postgres_password"
-      DAGSTER_POSTGRES_DB: "postgres_db"
-      GCS_FILINGS_BUCKET_NAME: "2de2b9f52c99a240-bucket-sec-10ks"
-      GCS_LABELS_BUCKET_NAME: "labeled-ex21-filings"
-      GCS_METADATA_DB_INSTANCE_CONNECTION: "catalyst-cooperative-mozilla:us-central1:pg-mozilla"
-      GCS_METADATA_DB_NAME: "postgres"
-      GCS_IAM_USER: "mozilla-dev-sa@catalyst-cooperative-mozilla.iam.gserviceaccount.com"
-      MLFLOW_TRACKING_URI: "https://mlflow-ned2up6sra-uc.a.run.app"
-      GCS_PROJECT: "catalyst-cooperative-mozilla"
-    networks:
-      - dagster
-
-networks:
-  dagster:
-    driver: bridge

From 101ccf17d28c4bba489a66b3603c821ec70955da Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Fri, 6 Sep 2024 14:24:05 -0400
Subject: [PATCH 037/161] Remove complex asset factory

---
 .../library/mlflow/mlflow_io_managers.py      |   1 +
 src/mozilla_sec_eia/models/sec10k/__init__.py |   5 +-
 .../models/sec10k/basic_10k.py                | 249 +++++++++---------
 .../models/sec10k/ex_21/__init__.py           |  68 +++--
 .../models/sec10k/ex_21/inference.py          |   7 +-
 src/mozilla_sec_eia/models/sec10k/extract.py  |  69 -----
 tests/unit/models/sec10k/extract_test.py      |  95 ++-----
 7 files changed, 199 insertions(+), 295 deletions(-)

diff --git a/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py b/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py
index 7aa05d7..0fa03b7 100644
--- a/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py
+++ b/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py
@@ -66,6 +66,7 @@ def _get_dagster_run_id(self, context: InputContext | OutputContext) -> str:
 
     def handle_output(self, context: OutputContext, df: pd.DataFrame):
         """Attach dataframe to run as artifact."""
+        context.log.debug("Attaching dataframe to run as artifact.")
         if self.file_type == "csv":
             self._log_artifact_as_csv(df, artifact_name=f"{context.name}.csv")
         else:
diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py
index 17c3758..b01d144 100644
--- a/src/mozilla_sec_eia/models/sec10k/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/__init__.py
@@ -10,7 +10,6 @@
 from mozilla_sec_eia.library import model_jobs
 from mozilla_sec_eia.library.generic_io_managers import PandasParquetIOManager
 from mozilla_sec_eia.library.mlflow import (
-    MlflowInterface,
     mlflow_interface_resource,
     mlflow_train_test_io_managers,
 )
@@ -74,7 +73,7 @@
         "pandas_parquet_io_manager": PandasParquetIOManager(
             base_path=UPath("gs://sec10k-outputs")
         ),
+        "exhibit21_extractor": ex_21.exhibit_21_extractor_resource,
     }
-    | mlflow_train_test_io_managers
-    | extract.SEC10k_EXTRACTOR_RESOURCES,
+    | mlflow_train_test_io_managers,
 )
diff --git a/src/mozilla_sec_eia/models/sec10k/basic_10k.py b/src/mozilla_sec_eia/models/sec10k/basic_10k.py
index 5768279..b790de5 100644
--- a/src/mozilla_sec_eia/models/sec10k/basic_10k.py
+++ b/src/mozilla_sec_eia/models/sec10k/basic_10k.py
@@ -3,109 +3,99 @@
 import logging
 
 import pandas as pd
-from dagster import AssetIn, asset
+from dagster import AssetIn, AssetOut, asset, multi_asset
 
 from mozilla_sec_eia.library import validation_helpers
 
 from .extract import (
-    Sec10kExtractor,
-    sec10k_extraction_asset_factory,
     sec10k_filing_metadata,
 )
-from .utils.cloud import GCSArchive, Sec10K, cloud_interface_resource
+from .utils.cloud import GCSArchive, Sec10K
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
 
-class Basic10kExtractor(Sec10kExtractor):
-    """Implement Sec10kExtractor for basic 10k company info data."""
-
-    name: str = "basic_10k_extractor"
-
-    def _extract_10k(self, filing: Sec10K):
-        """Extract basic company data from filing."""
-        logger.info(f"Extracting 10K company data from filing: {filing.filename}")
-        header = True
-        current_block = None
-        values = []
-        filer_count = 0
-        block_counts = {
-            "company data": 0,
-            "filing values": 0,
-            "business address": 0,
-            "mail address": 0,
-            "former company": 0,
-        }
-        unmatched_keys = []
-        for line in filing.filing_text.splitlines():
-            match line.replace("\t", "").lower().split(":"):
-                case ["filer", ""]:
-                    filer_count += 1
-                    header = False
-                case [
-                    (
-                        "company data"
-                        | "filing values"
-                        | "business address"
-                        | "mail address"
-                        | "former company"
-                    ) as block,
-                    "",
-                ] if not header:
-                    current_block = block
-                    block_counts[current_block] += 1
-                case [key, ""] if current_block is not None:
-                    key = f"{block}_{key}".replace(" ", "_")
-                    logger.warning(
-                        f"No value found for {key} for filing {filing.filename}"
-                    )
-                    unmatched_keys.append(key)
-                case [key, value] if current_block is not None:
-                    key = key.replace(" ", "_")
-                    values.append(
-                        {
-                            "filename": filing.filename,
-                            "filer_count": filer_count - 1,
-                            "block": current_block.replace(" ", "_"),
-                            "block_count": block_counts[current_block] - 1,
-                            "key": key.replace(" ", "_"),
-                            "value": value,
-                        }
-                    )
-                case ["</sec-header>" | "</ims-header>"]:
-                    break
-                case _ if header:
-                    continue
-
-        return pd.DataFrame(values), filing.filename, unmatched_keys
-
-    def extract_filings(
-        self,
-        filings_to_extract: pd.DataFrame,
-    ) -> tuple[pd.DataFrame, pd.DataFrame]:
-        """Extract basic 10K data and return extracted data/metadata."""
-        logger.info("Starting basic 10K extraction.")
-        logger.info(f"Extracting {len(filings_to_extract)} filings.")
-
-        extraction_metadata = pd.DataFrame(
-            {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)}
-        ).set_index("filename")
-        extracted = pd.DataFrame()
-
-        for filing in self.cloud_interface.iterate_filings(filings_to_extract):
-            ext, filename, unmatched_keys = self._extract_10k(filing)
-            extraction_metadata.loc[filename, ["success", "unmatched_keys"]] = [
-                len(ext) > 0,
-                ",".join(unmatched_keys),
-            ]
-            extracted = pd.concat([extracted, ext])
-
-        return (
-            extraction_metadata,
-            extracted.set_index(
-                ["filename", "filer_count", "block", "block_count", "key"]
-            ),
-        )
+def _extract_10k(filing: Sec10K):
+    """Extract basic company data from filing."""
+    logger.info(f"Extracting 10K company data from filing: {filing.filename}")
+    header = True
+    current_block = None
+    values = []
+    filer_count = 0
+    block_counts = {
+        "company data": 0,
+        "filing values": 0,
+        "business address": 0,
+        "mail address": 0,
+        "former company": 0,
+    }
+    unmatched_keys = []
+    for line in filing.filing_text.splitlines():
+        match line.replace("\t", "").lower().split(":"):
+            case ["filer", ""]:
+                filer_count += 1
+                header = False
+            case [
+                (
+                    "company data"
+                    | "filing values"
+                    | "business address"
+                    | "mail address"
+                    | "former company"
+                ) as block,
+                "",
+            ] if not header:
+                current_block = block
+                block_counts[current_block] += 1
+            case [key, ""] if current_block is not None:
+                key = f"{block}_{key}".replace(" ", "_")
+                logger.warning(f"No value found for {key} for filing {filing.filename}")
+                unmatched_keys.append(key)
+            case [key, value] if current_block is not None:
+                key = key.replace(" ", "_")
+                values.append(
+                    {
+                        "filename": filing.filename,
+                        "filer_count": filer_count - 1,
+                        "block": current_block.replace(" ", "_"),
+                        "block_count": block_counts[current_block] - 1,
+                        "key": key.replace(" ", "_"),
+                        "value": value,
+                    }
+                )
+            case ["</sec-header>" | "</ims-header>"]:
+                break
+            case _ if header:
+                continue
+
+    return pd.DataFrame(values), filing.filename, unmatched_keys
+
+
+def extract_filings(
+    cloud_interface: GCSArchive,
+    filings_to_extract: pd.DataFrame,
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Extract basic 10K data and return extracted data/metadata."""
+    logger.info("Starting basic 10K extraction.")
+    logger.info(f"Extracting {len(filings_to_extract)} filings.")
+
+    extraction_metadata = pd.DataFrame(
+        {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)}
+    ).set_index("filename")
+    extracted = pd.DataFrame()
+
+    for filing in cloud_interface.iterate_filings(filings_to_extract):
+        ext, filename, unmatched_keys = _extract_10k(filing)
+        extraction_metadata.loc[filename, ["success", "unmatched_keys"]] = [
+            len(ext) > 0,
+            ",".join(unmatched_keys),
+        ]
+        extracted = pd.concat([extracted, ext])
+
+    return (
+        extraction_metadata,
+        extracted.set_index(["filename", "filer_count", "block", "block_count", "key"]),
+    )
 
 
 @asset
@@ -117,25 +107,20 @@ def basic_10k_validation_set() -> pd.DataFrame:
     )
 
 
-basic_10k_extracted_validation_asset_name = "basic_10k_company_info_validation"
-
-
 @asset(
     ins={
-        basic_10k_extracted_validation_asset_name: AssetIn(
-            basic_10k_extracted_validation_asset_name
-        ),
-        "basic_10k_validation_set": AssetIn(),
+        "computed_df": AssetIn("basic_10k_company_info_validation"),
+        "validation_df": AssetIn("basic_10k_validation_set"),
     },
     io_manager_key="mlflow_metrics_io_manager",
 )
-def basic_10k_extraction_validation_metrics(**kwargs):
+def basic_10k_extraction_validation_metrics(
+    computed_df: pd.DataFrame,
+    validation_df: pd.DataFrame,
+):
     """Compute basic 10k extraction validation metrics."""
-    computed = kwargs[basic_10k_extracted_validation_asset_name]
-    validation = kwargs["basic_10k_validation_set"]
-
     return validation_helpers.pandas_compute_precision_recall(
-        computed, validation, value_col="value"
+        computed_df, validation_df, value_col="value"
     )
 
 
@@ -153,32 +138,48 @@ def basic_10k_validation_filing_metadata(
     ]
 
 
-basic_10k_extractor_resource = Basic10kExtractor(
-    cloud_interface=cloud_interface_resource
-)
-basic_10k_production_extraction = sec10k_extraction_asset_factory(
-    "basic_10k",
-    basic_10k_extractor_resource,
-    extraction_metadata_asset_name="basic_10k_extraction_metadata",
-    extracted_asset_name="basic_10k_company_info",
-    io_manager_key="pandas_parquet_io_manager",
+@multi_asset(
+    outs={
+        "basic_10k_extraction_metadata": AssetOut(
+            io_manager_key="pandas_parquet_io_manager"
+        ),
+        "basic_10k_company_info": AssetOut(io_manager_key="pandas_parquet_io_manager"),
+    },
 )
+def basic_10k_extract(
+    cloud_interface: GCSArchive,
+    sec10k_filing_metadata: pd.DataFrame,
+):
+    """Production asset for extracting basic 10k company info."""
+    metadata, extracted = extract_filings(cloud_interface, sec10k_filing_metadata)
+    return metadata, extracted
 
 
-basic_10k_validation_extraction = sec10k_extraction_asset_factory(
-    "basic_10k_validation",
-    basic_10k_extractor_resource,
-    filing_metadata_asset_name="basic_10k_validation_filing_metadata",
-    extraction_metadata_asset_name="basic_10k_extraction_validation_metadata",
-    extracted_asset_name=basic_10k_extracted_validation_asset_name,
-    partitions_def=None,
-    io_manager_key="mlflow_pandas_artifact_io_manager",
+@multi_asset(
+    outs={
+        "basic_10k_extraction_metadata_validation": AssetOut(
+            io_manager_key="mlflow_pandas_artifact_io_manager"
+        ),
+        "basic_10k_company_info_validation": AssetOut(
+            io_manager_key="mlflow_pandas_artifact_io_manager"
+        ),
+    },
 )
+def basic_10k_extract_validation(
+    cloud_interface: GCSArchive,
+    basic_10k_validation_filing_metadata: pd.DataFrame,
+):
+    """Validation asset for extracting basic 10k company info."""
+    metadata, extracted = extract_filings(
+        cloud_interface, basic_10k_validation_filing_metadata
+    )
+    return metadata, extracted
+
 
-production_assets = [basic_10k_production_extraction, sec10k_filing_metadata]
+production_assets = [basic_10k_extract, sec10k_filing_metadata]
 
 validation_assets = [
-    basic_10k_validation_extraction,
+    basic_10k_extract_validation,
     basic_10k_validation_set,
     basic_10k_validation_filing_metadata,
     basic_10k_extraction_validation_metrics,
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
index 4b6b3cf..7287084 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
@@ -7,7 +7,7 @@
 from mozilla_sec_eia.library import validation_helpers
 from mozilla_sec_eia.library.mlflow import MlflowInterface, mlflow_interface_resource
 
-from ..extract import sec10k_extraction_asset_factory, sec10k_filing_metadata
+from ..extract import sec10k_filing_metadata
 from ..utils.cloud import GCSArchive, cloud_interface_resource, get_metadata_filename
 from ..utils.layoutlm import LayoutlmResource
 from .inference import Exhibit21Extractor, clean_extracted_df
@@ -15,7 +15,7 @@
 
 @asset
 def ex21_validation_set() -> pd.DataFrame:
-    """Return dataframe containing basic 10k validation data."""
+    """Return dataframe containing exhibit 21 validation data."""
     return clean_ex21_validation_set(
         validation_helpers.load_validation_data("ex21_labels.csv")
     )
@@ -33,12 +33,9 @@ def ex21_validation_filing_metadata(
     ]
 
 
-ex21_extracted_validation_asset_name = "ex21_validation"
-
-
 @multi_asset(
     ins={
-        "computed_df": AssetIn(ex21_extracted_validation_asset_name),
+        "computed_df": AssetIn("ex21_company_ownership_info_validation"),
         "validation_df": AssetIn("ex21_validation_set"),
     },
     outs={
@@ -158,36 +155,59 @@ def test_extraction_metrics(
             exhibit21_extractor.extract_filings(filings.sample(num_filings))
 
 
-exhibit_21_extractor_resource = Exhibit21Extractor(
-    cloud_interface=cloud_interface_resource,
-    layoutlm=LayoutlmResource(mlflow_interface=mlflow_interface_resource),
+@multi_asset(
+    outs={
+        "ex21_extraction_metadata": AssetOut(
+            io_manager_key="pandas_parquet_io_manager"
+        ),
+        "ex21_company_ownership_info": AssetOut(
+            io_manager_key="pandas_parquet_io_manager"
+        ),
+    }
 )
-ex21_production_extraction = sec10k_extraction_asset_factory(
-    "ex21",
-    exhibit_21_extractor_resource,
-    extraction_metadata_asset_name="ex21_extraction_metadata",
-    extracted_asset_name="ex21_company_ownership_info",
-    io_manager_key="pandas_parquet_io_manager",
+def ex21_extract(
+    sec10k_filing_metadata: pd.DataFrame,
+    exhibit21_extractor: Exhibit21Extractor,
+):
+    """Extract ownership info from exhibit 21 docs."""
+    metadata, extracted = exhibit21_extractor.extract_filings(sec10k_filing_metadata)
+    return metadata, extracted
+
+
+@multi_asset(
+    outs={
+        "ex21_extraction_metadata_validation": AssetOut(
+            io_manager_key="mlflow_pandas_artifact_io_manager"
+        ),
+        "ex21_company_ownership_info_validation": AssetOut(
+            io_manager_key="mlflow_pandas_artifact_io_manager"
+        ),
+    }
 )
+def ex21_extract_validation(
+    ex21_validation_filing_metadata: pd.DataFrame,
+    exhibit21_extractor: Exhibit21Extractor,
+):
+    """Extract ownership info from exhibit 21 docs."""
+    metadata, extracted = exhibit21_extractor.extract_filings(
+        ex21_validation_filing_metadata
+    )
+    return metadata, extracted
 
 
-ex21_validation_extraction = sec10k_extraction_asset_factory(
-    "ex21_validation",
-    exhibit_21_extractor_resource,
-    filing_metadata_asset_name="ex21_validation_filing_metadata",
-    extraction_metadata_asset_name="ex21_extraction_validation_metadata",
-    extracted_asset_name=ex21_extracted_validation_asset_name,
-    partitions_def=None,
+exhibit_21_extractor_resource = Exhibit21Extractor(
+    cloud_interface=cloud_interface_resource,
+    layoutlm=LayoutlmResource(mlflow_interface=mlflow_interface_resource),
 )
 
 production_assets = [
     sec10k_filing_metadata,
-    ex21_production_extraction,
+    ex21_extract,
 ]
 
 validation_assets = [
     ex21_validation_set,
     ex21_validation_filing_metadata,
-    ex21_validation_extraction,
+    ex21_extract_validation,
     ex21_validation_metrics,
 ]
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
index 76116c3..e3615de 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
@@ -10,6 +10,7 @@
 import numpy as np
 import pandas as pd
 import torch
+from dagster import ConfigurableResource
 from datasets import Dataset
 from pydantic import PrivateAttr
 from transformers import (
@@ -18,8 +19,7 @@
 )
 from transformers.tokenization_utils_base import BatchEncoding
 
-from ..extract import Sec10kExtractor
-from ..utils.cloud import get_metadata_filename
+from ..utils.cloud import GCSArchive, get_metadata_filename
 from ..utils.layoutlm import (
     LayoutlmResource,
     get_id_label_conversions,
@@ -188,9 +188,10 @@ def _get_data(dataset):
     yield from dataset
 
 
-class Exhibit21Extractor(Sec10kExtractor):
+class Exhibit21Extractor(ConfigurableResource):
     """Implement `Sec10kExtractor` interface for exhibit 21 data."""
 
+    cloud_interface: GCSArchive
     layoutlm: LayoutlmResource
     name: str = "exhibit21_extractor"
     device: str = "cpu"
diff --git a/src/mozilla_sec_eia/models/sec10k/extract.py b/src/mozilla_sec_eia/models/sec10k/extract.py
index 9db88bb..246c1b5 100644
--- a/src/mozilla_sec_eia/models/sec10k/extract.py
+++ b/src/mozilla_sec_eia/models/sec10k/extract.py
@@ -3,34 +3,12 @@
 import pandas as pd
 from dagster import (
     AssetExecutionContext,
-    AssetIn,
-    AssetOut,
-    ConfigurableResource,
     StaticPartitionsDefinition,
     asset,
-    multi_asset,
 )
 
 from .utils.cloud import GCSArchive
 
-SEC10k_EXTRACTOR_RESOURCES = {}
-
-
-class Sec10kExtractor(ConfigurableResource):
-    """Base class for extracting SEC 10k data."""
-
-    cloud_interface: GCSArchive
-    name: str
-
-    def extract_filings(
-        self, filing_metadata: pd.DataFrame
-    ) -> tuple[pd.DataFrame, pd.DataFrame]:
-        """Method must be implemented by subclasses to extract SEC10k filings."""
-        raise NotImplementedError(
-            "extract_filings must be implemented by any subclass!"
-        )
-
-
 # Create year_quarter partitions
 year_quarter_partitions = StaticPartitionsDefinition(
     [f"{year}q{quarter}" for year in range(1994, 2024) for quarter in range(1, 5)]
@@ -46,50 +24,3 @@ def sec10k_filing_metadata(
     year_quarter = context.partition_key
     df = cloud_interface.get_metadata(year_quarter=year_quarter)
     return df
-
-
-def sec10k_extraction_asset_factory(
-    name: str,
-    sec10k_extractor: Sec10kExtractor,
-    partitions_def=year_quarter_partitions,
-    filing_metadata_asset_name: str = "sec10k_filing_metadata",
-    extraction_metadata_asset_name: str = "extraction_metadata",
-    extracted_asset_name: str = "extraction_metadata",
-    io_manager_key: str | None = None,
-):
-    """Create asset to extract data from sec10k data.
-
-    Args:
-        name: Name of extraction asset.
-        sec10k_extractor: Subclass of Sec10kExtractor used to extract data.
-        partitions_def: Partitions for asset (production uses year_quarter parts,
-            validation is not partitioned.
-        filing_metadata_asset_name: Name of input asset with metadata of filings to
-            extract.
-        extraction_metadata_asset_name: Name of output asset containing metadata
-            from extraction run.
-        extracted_asset_name: Name of output asset containing extracted data.
-    """
-
-    @multi_asset(
-        name=name,
-        outs={
-            extraction_metadata_asset_name: AssetOut(io_manager_key=io_manager_key),
-            extracted_asset_name: AssetOut(io_manager_key=io_manager_key),
-        },
-        ins={"sec10k_filing_metadata": AssetIn(filing_metadata_asset_name)},
-        partitions_def=partitions_def,
-        required_resource_keys={sec10k_extractor.name},
-    )
-    def extract_filings(
-        context: AssetExecutionContext, sec10k_filing_metadata: pd.DataFrame
-    ) -> tuple[pd.DataFrame, pd.DataFrame]:
-        """Run Sec10kExtractor on selected partition and return."""
-        extractor = context.resources.original_resource_dict[sec10k_extractor.name]
-        extraction_metadata, extracted = extractor.extract_filings(
-            sec10k_filing_metadata
-        )
-        return extraction_metadata, extracted
-
-    SEC10k_EXTRACTOR_RESOURCES[sec10k_extractor.name] = sec10k_extractor
-    return extract_filings
diff --git a/tests/unit/models/sec10k/extract_test.py b/tests/unit/models/sec10k/extract_test.py
index 82efac4..a654392 100644
--- a/tests/unit/models/sec10k/extract_test.py
+++ b/tests/unit/models/sec10k/extract_test.py
@@ -3,86 +3,37 @@
 import logging
 from unittest.mock import Mock
 
+import dagster
 import pandas as pd
-from dagster import asset, build_asset_context, materialize
-from mozilla_sec_eia.models.sec10k.extract import (
-    Sec10kExtractor,
-    sec10k_extraction_asset_factory,
-    sec10k_filing_metadata,
-)
-from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive
+import pytest
+from dagster import materialize
+from mozilla_sec_eia.models.sec10k.extract import sec10k_filing_metadata
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
 
 def test_sec10k_filing_metadata():
     """Test loading sec10k filing metadata."""
-    # Prepare inputs to sec10k_filing_metadata
-    context = build_asset_context(partition_key="2024q1")
-    cloud_interface = Mock()
-    output_df = pd.DataFrame({"col": ["fake_col"]})
-    cloud_interface.get_metadata.return_value = output_df
-
-    returned_df = sec10k_filing_metadata(
-        context=context,
-        cloud_interface=cloud_interface,
+    fake_cloud_interface = Mock()
+    df = pd.DataFrame({"fake_df": ["fake_col"]})
+    fake_cloud_interface.get_metadata.return_value = df
+    output = materialize(
+        [sec10k_filing_metadata],
+        partition_key="2020q1",
+        resources={"cloud_interface": fake_cloud_interface},
     )
 
-    # Check that GCSArchive.get_metadata was called correctly
-    cloud_interface.get_metadata.assert_called_once_with(year_quarter="2024q1")
-    pd.testing.assert_frame_equal(returned_df, output_df)
-
-
-def test_sec10k_extraction():
-    """Test loading sec10k filing metadata."""
-    fake_extraction_metadata = pd.DataFrame({"extraction_metadata": ["fake_col"]})
-    fake_extracted = pd.DataFrame({"extracted": ["fake_col"]})
-    fake_filing_metadata = pd.DataFrame({"filing_metadata": ["fake_col"]})
-
-    # Create fake Sec10kExtractor
-    class TestSec10kExtractor(Sec10kExtractor):
-        name: str = "test_extractor"
-
-        def extract_filings(self, filing_metadata):
-            pd.testing.assert_frame_equal(filing_metadata, fake_filing_metadata)
-            return fake_extraction_metadata, fake_extracted
-
-    # Create fake GCSArchive
-    class FakeArchive(GCSArchive):
-        filings_bucket_name: str = ""
-        labels_bucket_name: str = ""
-        metadata_db_instance_connection: str = ""
-        user: str = ""
-        metadata_db_name: str = ""
-        project: str = ""
+    fake_cloud_interface.get_metadata.assert_called_once_with(year_quarter="2020q1")
+    pd.testing.assert_frame_equal(df, output.asset_value("sec10k_filing_metadata"))
 
-        def setup_for_execution(self, context):
-            pass
 
-    # Asset to return fake filing metadata
-    @asset
-    def fake_filing_metadata_asset():
-        return fake_filing_metadata
-
-    # Create fake extraction asset with configured inputs
-    test_extractor = TestSec10kExtractor(cloud_interface=FakeArchive())
-    extraction_multi_asset = sec10k_extraction_asset_factory(
-        name="test_sec10k_extraction",
-        sec10k_extractor=test_extractor,
-        filing_metadata_asset_name="fake_filing_metadata_asset",
-        extracted_asset_name="test_sec10k_extraction",
-        extraction_metadata_asset_name="test_sec10k_extraction_metadata",
-        partitions_def=None,
-    )
-
-    # Run assets and review results
-    result = materialize(
-        [fake_filing_metadata_asset, extraction_multi_asset],
-        resources={test_extractor.name: test_extractor},
-    )
-    pd.testing.assert_frame_equal(
-        result.asset_value("test_sec10k_extraction_metadata"), fake_extraction_metadata
-    )
-    pd.testing.assert_frame_equal(
-        result.asset_value("test_sec10k_extraction"), fake_extracted
-    )
+def test_sec10k_filing_metadata_bad_return_type():
+    """Test loading sec10k_filing_metadata with bad return type."""
+    fake_cloud_interface = Mock()
+    fake_cloud_interface.get_metadata.return_value = "should be DataFrame"
+    with pytest.raises(dagster._core.errors.DagsterTypeCheckDidNotPass):
+        materialize(
+            [sec10k_filing_metadata],
+            partition_key="2020q1",
+            resources={"cloud_interface": fake_cloud_interface},
+        )

From 7e0c5a5dabc24b71fe926e91fe228520028e8ace Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Fri, 6 Sep 2024 16:12:31 -0400
Subject: [PATCH 038/161] Parallelize ex21 extraction

---
 src/mozilla_sec_eia/library/model_jobs.py     | 29 +++++++++------
 src/mozilla_sec_eia/models/sec10k/__init__.py |  1 +
 .../models/sec10k/ex_21/__init__.py           | 35 +++++++++++++++----
 .../models/sec10k/ex_21/inference.py          |  2 +-
 src/mozilla_sec_eia/models/sec10k/extract.py  | 22 ++++++++++++
 5 files changed, 71 insertions(+), 18 deletions(-)

diff --git a/src/mozilla_sec_eia/library/model_jobs.py b/src/mozilla_sec_eia/library/model_jobs.py
index 45602b4..87f6d15 100644
--- a/src/mozilla_sec_eia/library/model_jobs.py
+++ b/src/mozilla_sec_eia/library/model_jobs.py
@@ -24,23 +24,30 @@
 def create_production_model_job(
     job_name: str,
     assets: list[AssetsDefinition],
+    concurrency_limit: int | None = None,
     **kwargs,
 ) -> JobDefinition:
     """Construct a dagster job and supply Definitions with assets and resources."""
+    config = {
+        "ops": {},
+        "resources": {
+            "mlflow_interface": {
+                "config": {
+                    "experiment_name": job_name,
+                    "tracking_enabled": False,
+                }
+            }
+        },
+    }
+    if concurrency_limit is not None:
+        config["execution"] = {
+            "config": {"multiprocess": {"max_concurrent": concurrency_limit}}
+        }
+
     return define_asset_job(
         job_name,
         selection=assets,
-        config={
-            "ops": {},
-            "resources": {
-                "mlflow_interface": {
-                    "config": {
-                        "experiment_name": job_name,
-                        "tracking_enabled": False,
-                    }
-                }
-            },
-        },
+        config=config,
         **kwargs,
     )
 
diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py
index b01d144..4148007 100644
--- a/src/mozilla_sec_eia/models/sec10k/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/__init__.py
@@ -37,6 +37,7 @@
 ex21_production_job = model_jobs.create_production_model_job(
     "ex21_extraction",
     ex_21.production_assets,
+    concurrency_limit=4,
 )
 
 ex21_validation_job = model_jobs.create_validation_model_job(
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
index 7287084..06416b8 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
@@ -2,12 +2,12 @@
 
 import mlflow
 import pandas as pd
-from dagster import AssetIn, AssetOut, asset, multi_asset
+from dagster import AssetIn, AssetOut, Out, asset, graph_multi_asset, multi_asset, op
 
 from mozilla_sec_eia.library import validation_helpers
 from mozilla_sec_eia.library.mlflow import MlflowInterface, mlflow_interface_resource
 
-from ..extract import sec10k_filing_metadata
+from ..extract import chunk_filings, sec10k_filing_metadata, year_quarter_partitions
 from ..utils.cloud import GCSArchive, cloud_interface_resource, get_metadata_filename
 from ..utils.layoutlm import LayoutlmResource
 from .inference import Exhibit21Extractor, clean_extracted_df
@@ -155,7 +155,25 @@ def test_extraction_metrics(
             exhibit21_extractor.extract_filings(filings.sample(num_filings))
 
 
-@multi_asset(
+@op(out={"metadata": Out(), "extracted": Out()})
+def extract_filing_chunk(
+    exhibit21_extractor: Exhibit21Extractor, filings: pd.DataFrame
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Extract a set of filings and return results."""
+    metadata, extracted = exhibit21_extractor.extract_filings(filings)
+    return metadata, extracted
+
+
+@op(out={"metadata": Out(), "extracted": Out()})
+def collect_extracted_chunks(
+    metadata_dfs: list[pd.DataFrame],
+    extracted_dfs: list[pd.DataFrame],
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Collect chunks of extracted filings."""
+    return pd.concat(metadata_dfs), pd.concat(extracted_dfs)
+
+
+@graph_multi_asset(
     outs={
         "ex21_extraction_metadata": AssetOut(
             io_manager_key="pandas_parquet_io_manager"
@@ -163,14 +181,19 @@ def test_extraction_metrics(
         "ex21_company_ownership_info": AssetOut(
             io_manager_key="pandas_parquet_io_manager"
         ),
-    }
+    },
+    partitions_def=year_quarter_partitions,
 )
 def ex21_extract(
     sec10k_filing_metadata: pd.DataFrame,
-    exhibit21_extractor: Exhibit21Extractor,
 ):
     """Extract ownership info from exhibit 21 docs."""
-    metadata, extracted = exhibit21_extractor.extract_filings(sec10k_filing_metadata)
+    filing_chunks = chunk_filings(sec10k_filing_metadata)
+    metadata_chunks, extracted_chunks = filing_chunks.map(extract_filing_chunk)
+    metadata, extracted = collect_extracted_chunks(
+        metadata_chunks.collect(), extracted_chunks.collect()
+    )
+
     return metadata, extracted
 
 
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
index e3615de..afdbab4 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
@@ -428,7 +428,7 @@ def extract_table(self, all_outputs):
             (df["pred"].ne(df["pred"].shift())) & (df["pred"] != "other")
         ]
         first_in_group_df.loc[:, "iob_pred"] = (
-            "B" + first_in_group_df[:, "iob_pred"].str[1:]
+            "B" + first_in_group_df["iob_pred"].str[1:]
         )
         df.update(first_in_group_df)
         # filter for just words that were labeled with non "other" entities
diff --git a/src/mozilla_sec_eia/models/sec10k/extract.py b/src/mozilla_sec_eia/models/sec10k/extract.py
index 246c1b5..5547be0 100644
--- a/src/mozilla_sec_eia/models/sec10k/extract.py
+++ b/src/mozilla_sec_eia/models/sec10k/extract.py
@@ -1,10 +1,17 @@
 """Implement base class for an SEC10k extractor."""
 
+import math
+
+import numpy as np
 import pandas as pd
 from dagster import (
     AssetExecutionContext,
+    Config,
+    DynamicOut,
+    DynamicOutput,
     StaticPartitionsDefinition,
     asset,
+    op,
 )
 
 from .utils.cloud import GCSArchive
@@ -24,3 +31,18 @@ def sec10k_filing_metadata(
     year_quarter = context.partition_key
     df = cloud_interface.get_metadata(year_quarter=year_quarter)
     return df
+
+
+class ChunkFilingsConfig(Config):
+    """Set chunk size for chunk_filings."""
+
+    chunk_size: int = 128
+
+
+@op(out=DynamicOut())
+def chunk_filings(config: ChunkFilingsConfig, filings: pd.DataFrame) -> pd.DataFrame:
+    """Split filings into chunks for parallel processing."""
+    for i, filing_chunk in enumerate(
+        np.array_split(filings, math.ceil(len(filings) / config.chunk_size))
+    ):
+        yield DynamicOutput(filing_chunk, mapping_key=str(i))

From 080d7908bfe00e284cd52603f3156efd0a3ff0e6 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Fri, 6 Sep 2024 16:22:07 -0400
Subject: [PATCH 039/161] Don't chunk in inference module

---
 .../models/sec10k/ex_21/inference.py          | 86 ++++++++-----------
 1 file changed, 37 insertions(+), 49 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
index afdbab4..80c2440 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
@@ -1,7 +1,6 @@
 """Module for formatting inputs and performing inference with a fine-tuned LayoutLM model."""
 
 import logging
-import math
 import os
 import tempfile
 from contextlib import contextmanager
@@ -197,7 +196,6 @@ class Exhibit21Extractor(ConfigurableResource):
     device: str = "cpu"
     has_labels: bool = False
     dataset_ind: list | None = None
-    filing_chunk_size: int = 8
     _pdf_dir: Path = PrivateAttr()
     _labeled_json_dir: Path | None = PrivateAttr(default=None)
 
@@ -258,55 +256,45 @@ def extract_filings(
             ~filing_metadata["exhibit_21_version"].isna()
         ]
 
-        filing_chunks = np.array_split(
-            filings_with_ex21,
-            math.ceil(len(filings_with_ex21) / self.filing_chunk_size),
+        self.cloud_interface.get_filings(
+            filings_with_ex21, cache_directory=self._pdf_dir, cache_pdf=True
+        )
+        dataset = create_inference_dataset(
+            pdfs_dir=Path(self._pdf_dir),
+            labeled_json_dir=self._labeled_json_dir,
+            has_labels=self.has_labels,
+        )
+        if self.dataset_ind:
+            dataset = dataset.select(self.dataset_ind)
+
+        # TODO: figure out device argument
+        model, processor = self.layoutlm.get_model_components()
+        pipe = pipeline(
+            "token-classification",
+            model=model,
+            tokenizer=processor,
+            pipeline_class=LayoutLMInferencePipeline,
+            device=self.device,
         )
 
-        all_outputs_dfs = []
-        extraction_metadata_dfs = []
-        for filings in filing_chunks:
-            self.cloud_interface.get_filings(
-                filings_with_ex21, cache_directory=self._pdf_dir, cache_pdf=True
-            )
-            dataset = create_inference_dataset(
-                pdfs_dir=Path(self._pdf_dir),
-                labeled_json_dir=self._labeled_json_dir,
-                has_labels=self.has_labels,
-            )
-            if self.dataset_ind:
-                dataset = dataset.select(self.dataset_ind)
-
-            # TODO: figure out device argument
-            model, processor = self.layoutlm.get_model_components()
-            pipe = pipeline(
-                "token-classification",
-                model=model,
-                tokenizer=processor,
-                pipeline_class=LayoutLMInferencePipeline,
-                device=self.device,
-            )
-
-            logits = []
-            predictions = []
-            all_output_df = pd.DataFrame(columns=["id", "subsidiary", "loc", "own_per"])
-            extraction_metadata = pd.DataFrame(
-                {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)}
-            ).set_index("filename")
-            for logit, pred, output_df in pipe(_get_data(dataset)):
-                logits.append(logit)
-                predictions.append(pred)
-                if not output_df.empty:
-                    filename = get_metadata_filename(output_df["id"].iloc[0])
-                    extraction_metadata.loc[filename, ["success"]] = True
-                all_output_df = pd.concat([all_output_df, output_df])
-            all_output_df.columns.name = None
-            all_output_df = clean_extracted_df(all_output_df)
-            all_output_df = all_output_df[["id", "subsidiary", "loc", "own_per"]]
-            all_output_df = all_output_df.reset_index(drop=True)
-            all_outputs_dfs.append(all_output_df)
-            extraction_metadata_dfs.append(extraction_metadata)
-        return pd.concat(extraction_metadata_dfs), pd.concat(all_output_df)
+        logits = []
+        predictions = []
+        all_output_df = pd.DataFrame(columns=["id", "subsidiary", "loc", "own_per"])
+        extraction_metadata = pd.DataFrame(
+            {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)}
+        ).set_index("filename")
+        for logit, pred, output_df in pipe(_get_data(dataset)):
+            logits.append(logit)
+            predictions.append(pred)
+            if not output_df.empty:
+                filename = get_metadata_filename(output_df["id"].iloc[0])
+                extraction_metadata.loc[filename, ["success"]] = True
+            all_output_df = pd.concat([all_output_df, output_df])
+        all_output_df.columns.name = None
+        all_output_df = clean_extracted_df(all_output_df)
+        all_output_df = all_output_df[["id", "subsidiary", "loc", "own_per"]]
+        all_output_df = all_output_df.reset_index(drop=True)
+        return extraction_metadata, all_output_df
 
 
 class LayoutLMInferencePipeline(Pipeline):

From 44dfc5299b185741f594f33e9322e18aa61e25c7 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Fri, 6 Sep 2024 17:20:55 -0400
Subject: [PATCH 040/161] Handle failures in converting to pdf

---
 .../library/mlflow/mlflow_io_managers.py      |  1 -
 .../models/sec10k/ex_21/inference.py          | 40 ++++++++++++++++---
 .../models/sec10k/utils/cloud.py              |  8 +++-
 3 files changed, 41 insertions(+), 8 deletions(-)

diff --git a/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py b/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py
index 0fa03b7..7aa05d7 100644
--- a/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py
+++ b/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py
@@ -66,7 +66,6 @@ def _get_dagster_run_id(self, context: InputContext | OutputContext) -> str:
 
     def handle_output(self, context: OutputContext, df: pd.DataFrame):
         """Attach dataframe to run as artifact."""
-        print("HERE")
         if self.file_type == "csv":
             self._log_artifact_as_csv(df, artifact_name=f"{context.name}.csv")
         else:
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
index 80c2440..9912969 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
@@ -183,6 +183,34 @@ def get_flattened_mode_predictions(token_boxes_tensor, predictions_tensor):
     return flattened_modes
 
 
+def _cache_pdfs(
+    filings: pd.DataFrame, cloud_interface: GCSArchive, pdf_dir: Path
+) -> pd.DataFrame:
+    """Save each filing's Ex. 21 as a pdf in pdf_dir; return a record of failures."""
+    extraction_metadata = pd.DataFrame(
+        {
+            "filename": pd.Series(dtype=str),
+            "success": pd.Series(dtype=bool),
+            "notes": pd.Series(dtype=str),
+        }
+    ).set_index("filename")
+
+    for filing in cloud_interface.iterate_filings(filings):
+        pdf_path = cloud_interface.get_local_filename(
+            cache_directory=pdf_dir, filing=filing, extension=".pdf"
+        )
+
+        # Some filings are poorly formatted and fail in `save_as_pdf`.
+        # Record them (success=False, error text in `notes`) and keep going.
+        try:
+            with pdf_path.open("wb") as f:
+                filing.ex_21.save_as_pdf(f)
+        except Exception as e:
+            extraction_metadata.loc[filing.filename, ["success"]] = False
+            extraction_metadata.loc[filing.filename, ["notes"]] = str(e)
+    return extraction_metadata
+
+
 def _get_data(dataset):
     yield from dataset
 
@@ -202,6 +230,9 @@ class Exhibit21Extractor(ConfigurableResource):
     @contextmanager
     def yield_for_execution(self, context):
         """Setup temp path working directories."""
+        # Set env variable to improve GPU memory access
+        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+
         with (
             tempfile.TemporaryDirectory() as pdf_dir,
             tempfile.TemporaryDirectory() as labeled_json_dir,
@@ -256,8 +287,10 @@ def extract_filings(
             ~filing_metadata["exhibit_21_version"].isna()
         ]
 
-        self.cloud_interface.get_filings(
-            filings_with_ex21, cache_directory=self._pdf_dir, cache_pdf=True
+        extraction_metadata = _cache_pdfs(
+            filings_with_ex21,
+            cloud_interface=self.cloud_interface,
+            pdf_dir=self._pdf_dir,
         )
         dataset = create_inference_dataset(
             pdfs_dir=Path(self._pdf_dir),
@@ -280,9 +313,6 @@ def extract_filings(
         logits = []
         predictions = []
         all_output_df = pd.DataFrame(columns=["id", "subsidiary", "loc", "own_per"])
-        extraction_metadata = pd.DataFrame(
-            {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)}
-        ).set_index("filename")
         for logit, pred, output_df in pipe(_get_data(dataset)):
             logits.append(logit)
             predictions.append(pred)
diff --git a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py
index 81c7a35..5346ec6 100644
--- a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py
+++ b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py
@@ -226,11 +226,15 @@ def get_filing_blob(self, year_quarter: str, path: str) -> storage.Blob:
         return self._filings_bucket.blob(f"sec10k/sec10k-{year_quarter}/{path}")
 
     def get_local_filename(
-        self, cache_directory: Path, filing: pd.Series, extension=".html"
+        self, cache_directory: Path, filing: pd.Series | Sec10K, extension=".html"
     ) -> Path:
         """Return path to a filing in local cache based on metadata."""
+        if isinstance(filing, pd.Series):
+            filename = filing["filename"]
+        else:
+            filename = filing.filename
         return cache_directory / Path(
-            f"{filing['filename'].replace('edgar/data/', '').replace('/', '-')}".replace(
+            f"{filename.replace('edgar/data/', '').replace('/', '-')}".replace(
                 ".txt", extension
             )
         )

From 6e24157c95cb022285dded95f4d09fb6c2150f59 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Fri, 6 Sep 2024 18:26:02 -0400
Subject: [PATCH 041/161] Delete cached pdfs early

---
 .../models/sec10k/ex_21/inference.py          | 41 ++++++++-----------
 1 file changed, 16 insertions(+), 25 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
index 9912969..fe36752 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
@@ -11,7 +11,6 @@
 import torch
 from dagster import ConfigurableResource
 from datasets import Dataset
-from pydantic import PrivateAttr
 from transformers import (
     Pipeline,
     pipeline,
@@ -224,24 +223,12 @@ class Exhibit21Extractor(ConfigurableResource):
     device: str = "cpu"
     has_labels: bool = False
     dataset_ind: list | None = None
-    _pdf_dir: Path = PrivateAttr()
-    _labeled_json_dir: Path | None = PrivateAttr(default=None)
 
     @contextmanager
-    def yield_for_execution(self, context):
-        """Setup temp path working directories."""
-        # Set env variable to improve GPU memory access
+    def setup_for_execution(self, context):
+        """Set env variable to improve GPU memory access."""
         os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 
-        with (
-            tempfile.TemporaryDirectory() as pdf_dir,
-            tempfile.TemporaryDirectory() as labeled_json_dir,
-        ):
-            self._pdf_dir = pdf_dir
-            if self.has_labels:
-                self._labeled_json_dir = labeled_json_dir
-            yield self
-
     def extract_filings(
         self, filing_metadata: pd.DataFrame
     ) -> tuple[pd.DataFrame, pd.DataFrame]:
@@ -287,16 +274,20 @@ def extract_filings(
             ~filing_metadata["exhibit_21_version"].isna()
         ]
 
-        extraction_metadata = _cache_pdfs(
-            filings_with_ex21,
-            cloud_interface=self.cloud_interface,
-            pdf_dir=self._pdf_dir,
-        )
-        dataset = create_inference_dataset(
-            pdfs_dir=Path(self._pdf_dir),
-            labeled_json_dir=self._labeled_json_dir,
-            has_labels=self.has_labels,
-        )
+        with (
+            tempfile.TemporaryDirectory() as pdf_dir,
+            tempfile.TemporaryDirectory() as labeled_json_dir,
+        ):
+            extraction_metadata = _cache_pdfs(
+                filings_with_ex21,
+                cloud_interface=self.cloud_interface,
+                pdf_dir=pdf_dir,
+            )
+            dataset = create_inference_dataset(
+                pdfs_dir=Path(pdf_dir),
+                labeled_json_dir=labeled_json_dir,
+                has_labels=self.has_labels,
+            )
         if self.dataset_ind:
             dataset = dataset.select(self.dataset_ind)
 

From cd06d0709dd110ec212b5aff09aadb22d4048727 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Mon, 9 Sep 2024 11:53:45 -0400
Subject: [PATCH 042/161] Add metadata to chunk_filings

---
 src/mozilla_sec_eia/models/sec10k/extract.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/extract.py b/src/mozilla_sec_eia/models/sec10k/extract.py
index 5547be0..acb885e 100644
--- a/src/mozilla_sec_eia/models/sec10k/extract.py
+++ b/src/mozilla_sec_eia/models/sec10k/extract.py
@@ -9,6 +9,7 @@
     Config,
     DynamicOut,
     DynamicOutput,
+    OpExecutionContext,
     StaticPartitionsDefinition,
     asset,
     op,
@@ -40,9 +41,12 @@ class ChunkFilingsConfig(Config):
 
 
 @op(out=DynamicOut())
-def chunk_filings(config: ChunkFilingsConfig, filings: pd.DataFrame) -> pd.DataFrame:
+def chunk_filings(
+    context: OpExecutionContext, config: ChunkFilingsConfig, filings: pd.DataFrame
+) -> pd.DataFrame:
     """Split filings into chunks for parallel processing."""
     for i, filing_chunk in enumerate(
         np.array_split(filings, math.ceil(len(filings) / config.chunk_size))
     ):
+        context.add_output_metadata(metadata={"filings": list(filing_chunk.filename)})
         yield DynamicOutput(filing_chunk, mapping_key=str(i))

From e3e8c45895f9bea607982d287575c255baf5dce3 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Mon, 9 Sep 2024 12:35:17 -0400
Subject: [PATCH 043/161] Catch oom errors while extracting ex21

---
 .../models/sec10k/ex_21/__init__.py           | 22 ++++++++++++++++++-
 src/mozilla_sec_eia/models/sec10k/extract.py  |  6 +----
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
index 06416b8..35cdc10 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
@@ -1,7 +1,10 @@
 """Module for working with exhibit 21 data."""
 
+import logging
+
 import mlflow
 import pandas as pd
+import torch
 from dagster import AssetIn, AssetOut, Out, asset, graph_multi_asset, multi_asset, op
 
 from mozilla_sec_eia.library import validation_helpers
@@ -12,6 +15,8 @@
 from ..utils.layoutlm import LayoutlmResource
 from .inference import Exhibit21Extractor, clean_extracted_df
 
+logger = logging.getLogger(f"catalystcoop.{__name__}")
+
 
 @asset
 def ex21_validation_set() -> pd.DataFrame:
@@ -160,7 +165,20 @@ def extract_filing_chunk(
     exhibit21_extractor: Exhibit21Extractor, filings: pd.DataFrame
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Extract a set of filings and return results."""
-    metadata, extracted = exhibit21_extractor.extract_filings(filings)
+    try:
+        metadata, extracted = exhibit21_extractor.extract_filings(filings)
+    except torch.OutOfMemoryError:
+        logging.warning(
+            f"Ran out of memory while extracting filings: {filings['filename']}"
+        )
+        metadata = pd.DataFrame(
+            {
+                "filename": filings["filename"],
+                "success": [False] * len(filings),
+                "notes": ["Out of memory error"] * len(filings),
+            }
+        ).set_index("filename")
+        extracted = pd.DataFrame()
     return metadata, extracted
 
 
@@ -170,6 +188,8 @@ def collect_extracted_chunks(
     extracted_dfs: list[pd.DataFrame],
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Collect chunks of extracted filings."""
+    metadata_dfs = [df for df in metadata_dfs if not df.empty]
+    extracted_dfs = [df for df in extracted_dfs if not df.empty]
     return pd.concat(metadata_dfs), pd.concat(extracted_dfs)
 
 
diff --git a/src/mozilla_sec_eia/models/sec10k/extract.py b/src/mozilla_sec_eia/models/sec10k/extract.py
index acb885e..5547be0 100644
--- a/src/mozilla_sec_eia/models/sec10k/extract.py
+++ b/src/mozilla_sec_eia/models/sec10k/extract.py
@@ -9,7 +9,6 @@
     Config,
     DynamicOut,
     DynamicOutput,
-    OpExecutionContext,
     StaticPartitionsDefinition,
     asset,
     op,
@@ -41,12 +40,9 @@ class ChunkFilingsConfig(Config):
 
 
 @op(out=DynamicOut())
-def chunk_filings(
-    context: OpExecutionContext, config: ChunkFilingsConfig, filings: pd.DataFrame
-) -> pd.DataFrame:
+def chunk_filings(config: ChunkFilingsConfig, filings: pd.DataFrame) -> pd.DataFrame:
     """Split filings into chunks for parallel processing."""
     for i, filing_chunk in enumerate(
         np.array_split(filings, math.ceil(len(filings) / config.chunk_size))
     ):
-        context.add_output_metadata(metadata={"filings": list(filing_chunk.filename)})
         yield DynamicOutput(filing_chunk, mapping_key=str(i))

From 350defba0eee3576e79cbf81a428fcd210216351 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Mon, 9 Sep 2024 13:16:01 -0400
Subject: [PATCH 044/161] Fix ex21 gcs io-manager

---
 src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
index 35cdc10..c2adf11 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
@@ -182,7 +182,12 @@ def extract_filing_chunk(
     return metadata, extracted
 
 
-@op(out={"metadata": Out(), "extracted": Out()})
+@op(
+    out={
+        "metadata": Out(io_manager_key="pandas_parquet_io_manager"),
+        "extracted": Out(io_manager_key="pandas_parquet_io_manager"),
+    }
+)
 def collect_extracted_chunks(
     metadata_dfs: list[pd.DataFrame],
     extracted_dfs: list[pd.DataFrame],

From 3c80b72fdb69890f05b5d9eabd5838b3784ddc30 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Mon, 9 Sep 2024 14:21:53 -0400
Subject: [PATCH 045/161] Fix partitions for basic 10k extraction.

---
 src/mozilla_sec_eia/models/sec10k/basic_10k.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/mozilla_sec_eia/models/sec10k/basic_10k.py b/src/mozilla_sec_eia/models/sec10k/basic_10k.py
index b790de5..538d477 100644
--- a/src/mozilla_sec_eia/models/sec10k/basic_10k.py
+++ b/src/mozilla_sec_eia/models/sec10k/basic_10k.py
@@ -9,6 +9,7 @@
 
 from .extract import (
     sec10k_filing_metadata,
+    year_quarter_partitions,
 )
 from .utils.cloud import GCSArchive, Sec10K
 
@@ -145,6 +146,7 @@ def basic_10k_validation_filing_metadata(
         ),
         "basic_10k_company_info": AssetOut(io_manager_key="pandas_parquet_io_manager"),
     },
+    partitions_def=year_quarter_partitions,
 )
 def basic_10k_extract(
     cloud_interface: GCSArchive,

From 31971b7e6b1616a781b5ca59620a1aab10c89c7c Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Mon, 9 Sep 2024 16:06:09 -0400
Subject: [PATCH 046/161] Cache layoutlm locally

---
 src/mozilla_sec_eia/models/sec10k/__init__.py |  7 +++-
 .../models/sec10k/ex_21/__init__.py           | 41 +++++++++++++------
 .../models/sec10k/ex_21/inference.py          |  4 +-
 .../models/sec10k/utils/layoutlm.py           | 16 ++++++++
 4 files changed, 51 insertions(+), 17 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py
index 4148007..986cf85 100644
--- a/src/mozilla_sec_eia/models/sec10k/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/__init__.py
@@ -16,7 +16,7 @@
 
 from . import basic_10k, ex_21, extract
 from .utils.cloud import cloud_interface_resource
-from .utils.layoutlm import LayoutlmIOManager
+from .utils.layoutlm import LayoutlmIOManager, LayoutlmLocalIOManager
 
 basic_10k_assets = load_assets_from_modules([basic_10k])
 ex21_assets = load_assets_from_package_module(ex_21)
@@ -46,7 +46,7 @@
 )
 
 ex21_test_job = model_jobs.create_validation_model_job(
-    "ex21_test", [ex_21.test_extraction_metrics]
+    "ex21_test", [ex_21.test_extraction_metrics, ex_21.layoutlm_local_cache]
 )
 
 layoutlm_finetune_job = model_jobs.create_training_job(
@@ -75,6 +75,9 @@
             base_path=UPath("gs://sec10k-outputs")
         ),
         "exhibit21_extractor": ex_21.exhibit_21_extractor_resource,
+        "layoutlm_local_io_manager": LayoutlmLocalIOManager(
+            mlflow_interface=mlflow_interface_resource
+        ),
     }
     | mlflow_train_test_io_managers,
 )
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
index c2adf11..f70940f 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
@@ -12,7 +12,6 @@
 
 from ..extract import chunk_filings, sec10k_filing_metadata, year_quarter_partitions
 from ..utils.cloud import GCSArchive, cloud_interface_resource, get_metadata_filename
-from ..utils.layoutlm import LayoutlmResource
 from .inference import Exhibit21Extractor, clean_extracted_df
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
@@ -162,14 +161,20 @@ def test_extraction_metrics(
 
 @op(out={"metadata": Out(), "extracted": Out()})
 def extract_filing_chunk(
-    exhibit21_extractor: Exhibit21Extractor, filings: pd.DataFrame
+    exhibit21_extractor: Exhibit21Extractor,
+    filings: pd.DataFrame,
+    layoutlm,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Extract a set of filings and return results."""
     try:
-        metadata, extracted = exhibit21_extractor.extract_filings(filings)
-    except torch.OutOfMemoryError:
+        metadata, extracted = exhibit21_extractor.extract_filings(
+            filings,
+            model=layoutlm["model"],
+            processor=layoutlm["processor"],
+        )
+    except (torch.OutOfMemoryError, RuntimeError) as e:
         logging.warning(
-            f"Ran out of memory while extracting filings: {filings['filename']}"
+            f"Error {str(e)} while extracting filings: {filings['filename']}"
         )
         metadata = pd.DataFrame(
             {
@@ -198,6 +203,15 @@ def collect_extracted_chunks(
     return pd.concat(metadata_dfs), pd.concat(extracted_dfs)
 
 
+@asset(
+    io_manager_key="layoutlm_local_io_manager",
+    ins={"layoutlm": AssetIn(input_manager_key="layoutlm_io_manager")},
+)
+def layoutlm_local_cache(layoutlm):
+    """Load pretrained layoutlm from mlflow and save to local path."""
+    return layoutlm
+
+
 @graph_multi_asset(
     outs={
         "ex21_extraction_metadata": AssetOut(
@@ -211,10 +225,13 @@ def collect_extracted_chunks(
 )
 def ex21_extract(
     sec10k_filing_metadata: pd.DataFrame,
+    layoutlm_local_cache,
 ):
     """Extract ownership info from exhibit 21 docs."""
     filing_chunks = chunk_filings(sec10k_filing_metadata)
-    metadata_chunks, extracted_chunks = filing_chunks.map(extract_filing_chunk)
+    metadata_chunks, extracted_chunks = filing_chunks.map(
+        lambda filings: extract_filing_chunk(filings, layoutlm_local_cache)
+    )
     metadata, extracted = collect_extracted_chunks(
         metadata_chunks.collect(), extracted_chunks.collect()
     )
@@ -235,27 +252,27 @@ def ex21_extract(
 def ex21_extract_validation(
     ex21_validation_filing_metadata: pd.DataFrame,
     exhibit21_extractor: Exhibit21Extractor,
+    layoutlm_local_cache,
 ):
     """Extract ownership info from exhibit 21 docs."""
     metadata, extracted = exhibit21_extractor.extract_filings(
-        ex21_validation_filing_metadata
+        ex21_validation_filing_metadata,
+        model=layoutlm_local_cache["model"],
+        processor=layoutlm_local_cache["processor"],
     )
     return metadata, extracted
 
 
 exhibit_21_extractor_resource = Exhibit21Extractor(
     cloud_interface=cloud_interface_resource,
-    layoutlm=LayoutlmResource(mlflow_interface=mlflow_interface_resource),
 )
 
-production_assets = [
-    sec10k_filing_metadata,
-    ex21_extract,
-]
+production_assets = [sec10k_filing_metadata, ex21_extract, layoutlm_local_cache]
 
 validation_assets = [
     ex21_validation_set,
     ex21_validation_filing_metadata,
     ex21_extract_validation,
     ex21_validation_metrics,
+    layoutlm_local_cache,
 ]
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
index fe36752..96b1672 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
@@ -19,7 +19,6 @@
 
 from ..utils.cloud import GCSArchive, get_metadata_filename
 from ..utils.layoutlm import (
-    LayoutlmResource,
     get_id_label_conversions,
     iob_to_label,
     normalize_bboxes,
@@ -218,7 +217,6 @@ class Exhibit21Extractor(ConfigurableResource):
     """Implement `Sec10kExtractor` interface for exhibit 21 data."""
 
     cloud_interface: GCSArchive
-    layoutlm: LayoutlmResource
     name: str = "exhibit21_extractor"
     device: str = "cpu"
     has_labels: bool = False
@@ -230,7 +228,7 @@ def setup_for_execution(self, context):
         os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 
     def extract_filings(
-        self, filing_metadata: pd.DataFrame
+        self, filing_metadata: pd.DataFrame, model, processor
     ) -> tuple[pd.DataFrame, pd.DataFrame]:
         """Predict entities with a fine-tuned model and extract Ex. 21 tables.
 
diff --git a/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py b/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py
index 1e88052..a10da6d 100644
--- a/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py
+++ b/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py
@@ -35,6 +35,22 @@ def load_input(self, context: InputContext) -> dict:
         return _load_pretrained_layoutlm(self.version)
 
 
+class LayoutlmLocalIOManager(MlflowBaseIOManager):
+    """Load and log mlflow models to local path."""
+
+    local_path: str = "./layoutlm"
+
+    def handle_output(self, context: OutputContext, components: dict):
+        """Load metrics to mlflow run/experiment created by `MlflowInterface`."""
+        mlflow.transformers.save_model(
+            components, path=self.local_path, task="token-classification"
+        )
+
+    def load_input(self, context: InputContext) -> dict:
+        """Load transformers model components from `local_path`."""
+        return mlflow.transformers.load_model(self.local_path, return_type="components")
+
+
 class LayoutlmResource(ConfigurableResource):
     """Dagster resource for loading/using pretrained layoutlm model as a resource."""
 

From 634a050a1f598fe9ecd8585c69bd2b8c4b9de726 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Mon, 9 Sep 2024 19:24:02 -0400
Subject: [PATCH 047/161] Fix caching model

---
 src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py | 4 ++--
 src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
index f70940f..5a0991a 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
@@ -170,7 +170,7 @@ def extract_filing_chunk(
         metadata, extracted = exhibit21_extractor.extract_filings(
             filings,
             model=layoutlm["model"],
-            processor=layoutlm["processor"],
+            processor=layoutlm["tokenizer"],
         )
     except (torch.OutOfMemoryError, RuntimeError) as e:
         logging.warning(
@@ -258,7 +258,7 @@ def ex21_extract_validation(
     metadata, extracted = exhibit21_extractor.extract_filings(
         ex21_validation_filing_metadata,
         model=layoutlm_local_cache["model"],
-        processor=layoutlm_local_cache["processor"],
+        processor=layoutlm_local_cache["tokenizer"],
     )
     return metadata, extracted
 
diff --git a/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py b/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py
index a10da6d..f1fcb48 100644
--- a/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py
+++ b/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py
@@ -15,7 +15,7 @@ def _load_pretrained_layoutlm(version: str = "latest") -> dict:
     """Function to load layoutlm from mlflow."""
     path = f"models:/layoutlm_extractor/{version}"
 
-    return mlflow.transformers.load_model(path, return_type="components")
+    return mlflow.transformers.load_model(path, return_type="pipeline")
 
 
 class LayoutlmIOManager(MlflowBaseIOManager):
@@ -40,10 +40,10 @@ class LayoutlmLocalIOManager(MlflowBaseIOManager):
 
     local_path: str = "./layoutlm"
 
-    def handle_output(self, context: OutputContext, components: dict):
+    def handle_output(self, context: OutputContext, pipeline):
         """Load metrics to mlflow run/experiment created by `MlflowInterface`."""
         mlflow.transformers.save_model(
-            components, path=self.local_path, task="token-classification"
+            pipeline, path=self.local_path, task="token-classification"
         )
 
     def load_input(self, context: InputContext) -> dict:

From 69ee4c04229178c1c6da451f9eb71e2d22e810d5 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Mon, 9 Sep 2024 19:31:45 -0400
Subject: [PATCH 048/161] Remove bad call

---
 src/mozilla_sec_eia/models/sec10k/ex_21/inference.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
index 96b1672..46ab425 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
@@ -290,7 +290,6 @@ def extract_filings(
             dataset = dataset.select(self.dataset_ind)
 
         # TODO: figure out device argument
-        model, processor = self.layoutlm.get_model_components()
         pipe = pipeline(
             "token-classification",
             model=model,

From 63d66006e724671c460dec8f164da2c5bb3e38d6 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Mon, 9 Sep 2024 20:03:04 -0400
Subject: [PATCH 049/161] Test own_per conversion

---
 src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
index 5a0991a..fdcf27a 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
@@ -200,7 +200,10 @@ def collect_extracted_chunks(
     """Collect chunks of extracted filings."""
     metadata_dfs = [df for df in metadata_dfs if not df.empty]
     extracted_dfs = [df for df in extracted_dfs if not df.empty]
-    return pd.concat(metadata_dfs), pd.concat(extracted_dfs)
+    metadata_df = pd.concat(metadata_dfs)
+    extracted_df = pd.concat(extracted_dfs)
+    extracted_df["own_per"] = extracted_df["own_per"].astype("float64", errors="ignore")
+    return metadata_df, extracted_df
 
 
 @asset(

From c8490d470d66ea102c3c64f140aa77832e88c2e7 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Tue, 10 Sep 2024 15:26:27 -0400
Subject: [PATCH 050/161] Add pandera types for output tables

---
 pyproject.toml                                |  1 +
 .../models/sec10k/basic_10k.py                | 17 +++++++---
 .../models/sec10k/ex_21/__init__.py           | 33 +++++++++++++++----
 .../models/sec10k/ex_21/inference.py          |  4 +++
 4 files changed, 43 insertions(+), 12 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 1edfd0f..b026c10 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,6 +23,7 @@ dependencies = [
     "dagster>=1.7.15", # 1.7.13 & 1.7.14 were both breaking things
     "dagster-mlflow",
     "dagster-webserver",
+    "dagster-pandera",
     "datasets>=2.1,<3", # Access Hugging Face datasets
     "seqeval>=1.2,<2", # Sequence labeling evaluation
     "google-cloud-secret-manager>=2,<3",
diff --git a/src/mozilla_sec_eia/models/sec10k/basic_10k.py b/src/mozilla_sec_eia/models/sec10k/basic_10k.py
index 538d477..22b7a2a 100644
--- a/src/mozilla_sec_eia/models/sec10k/basic_10k.py
+++ b/src/mozilla_sec_eia/models/sec10k/basic_10k.py
@@ -7,6 +7,7 @@
 
 from mozilla_sec_eia.library import validation_helpers
 
+from .entities import basic_10k_extract_type, sec10k_extract_metadata_type
 from .extract import (
     sec10k_filing_metadata,
     year_quarter_partitions,
@@ -99,7 +100,7 @@ def extract_filings(
     )
 
 
-@asset
+@asset(dagster_type=basic_10k_extract_type)
 def basic_10k_validation_set() -> pd.DataFrame:
     """Return dataframe containing basic 10k validation data."""
     return validation_helpers.load_validation_data(
@@ -142,9 +143,13 @@ def basic_10k_validation_filing_metadata(
 @multi_asset(
     outs={
         "basic_10k_extraction_metadata": AssetOut(
-            io_manager_key="pandas_parquet_io_manager"
+            io_manager_key="pandas_parquet_io_manager",
+            dagster_type=sec10k_extract_metadata_type,
+        ),
+        "basic_10k_company_info": AssetOut(
+            io_manager_key="pandas_parquet_io_manager",
+            dagster_type=basic_10k_extract_type,
         ),
-        "basic_10k_company_info": AssetOut(io_manager_key="pandas_parquet_io_manager"),
     },
     partitions_def=year_quarter_partitions,
 )
@@ -160,10 +165,12 @@ def basic_10k_extract(
 @multi_asset(
     outs={
         "basic_10k_extraction_metadata_validation": AssetOut(
-            io_manager_key="mlflow_pandas_artifact_io_manager"
+            io_manager_key="mlflow_pandas_artifact_io_manager",
+            dagster_type=sec10k_extract_metadata_type,
         ),
         "basic_10k_company_info_validation": AssetOut(
-            io_manager_key="mlflow_pandas_artifact_io_manager"
+            io_manager_key="mlflow_pandas_artifact_io_manager",
+            dagster_type=basic_10k_extract_type,
         ),
     },
 )
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
index fdcf27a..323f115 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
@@ -10,6 +10,11 @@
 from mozilla_sec_eia.library import validation_helpers
 from mozilla_sec_eia.library.mlflow import MlflowInterface, mlflow_interface_resource
 
+from ..entities import (
+    Ex21CompanyOwnership,
+    ex21_extract_type,
+    sec10k_extract_metadata_type,
+)
 from ..extract import chunk_filings, sec10k_filing_metadata, year_quarter_partitions
 from ..utils.cloud import GCSArchive, cloud_interface_resource, get_metadata_filename
 from .inference import Exhibit21Extractor, clean_extracted_df
@@ -17,7 +22,7 @@
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
 
-@asset
+@asset(dagster_type=ex21_extract_type)
 def ex21_validation_set() -> pd.DataFrame:
     """Return dataframe containing exhibit 21 validation data."""
     return clean_ex21_validation_set(
@@ -159,7 +164,12 @@ def test_extraction_metrics(
             exhibit21_extractor.extract_filings(filings.sample(num_filings))
 
 
-@op(out={"metadata": Out(), "extracted": Out()})
+@op(
+    out={
+        "metadata": Out(dagster_type=sec10k_extract_metadata_type),
+        "extracted": Out(dagster_type=ex21_extract_type),
+    }
+)
 def extract_filing_chunk(
     exhibit21_extractor: Exhibit21Extractor,
     filings: pd.DataFrame,
@@ -183,14 +193,21 @@ def extract_filing_chunk(
                 "notes": ["Out of memory error"] * len(filings),
             }
         ).set_index("filename")
-        extracted = pd.DataFrame()
+        extracted = Ex21CompanyOwnership.example(size=0)
+    extracted.own_per = extracted.own_per.astype("float64")
     return metadata, extracted
 
 
 @op(
     out={
-        "metadata": Out(io_manager_key="pandas_parquet_io_manager"),
-        "extracted": Out(io_manager_key="pandas_parquet_io_manager"),
+        "metadata": Out(
+            io_manager_key="pandas_parquet_io_manager",
+            dagster_type=sec10k_extract_metadata_type,
+        ),
+        "extracted": Out(
+            io_manager_key="pandas_parquet_io_manager",
+            dagster_type=ex21_extract_type,
+        ),
     }
 )
 def collect_extracted_chunks(
@@ -245,10 +262,12 @@ def ex21_extract(
 @multi_asset(
     outs={
         "ex21_extraction_metadata_validation": AssetOut(
-            io_manager_key="mlflow_pandas_artifact_io_manager"
+            io_manager_key="mlflow_pandas_artifact_io_manager",
+            dagster_type=sec10k_extract_metadata_type,
         ),
         "ex21_company_ownership_info_validation": AssetOut(
-            io_manager_key="mlflow_pandas_artifact_io_manager"
+            io_manager_key="mlflow_pandas_artifact_io_manager",
+            dagster_type=ex21_extract_type,
         ),
     }
 )
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
index 46ab425..27bd2dc 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
@@ -120,6 +120,10 @@ def clean_extracted_df(extracted_df):
         extracted_df["own_per"] = extracted_df["own_per"].str.replace(
             r"[^\d.]", "", regex=True
         )
+        # Truncate values with multiple decimal points at the second point
+        extracted_df["own_per"] = extracted_df["own_per"].str.replace(
+            r"(\d*\.\d+)\..*", r"\1", regex=True
+        )
         extracted_df["own_per"] = extracted_df["own_per"].replace("", np.nan)
         extracted_df["own_per"] = extracted_df["own_per"].astype(
             "float64", errors="ignore"

From fa4f57da3fc9d4ec50570380578dd476728d47d5 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Tue, 10 Sep 2024 15:31:31 -0400
Subject: [PATCH 051/161] Add missing entities module

---
 src/mozilla_sec_eia/models/sec10k/entities.py | 57 +++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100644 src/mozilla_sec_eia/models/sec10k/entities.py

diff --git a/src/mozilla_sec_eia/models/sec10k/entities.py b/src/mozilla_sec_eia/models/sec10k/entities.py
new file mode 100644
index 0000000..6abc4c8
--- /dev/null
+++ b/src/mozilla_sec_eia/models/sec10k/entities.py
@@ -0,0 +1,57 @@
+"""Define table structures for SEC10k extraction."""
+
+import pandera as pa
+from dagster_pandera import pandera_schema_to_dagster_type
+from pandera.typing import Index, Series
+
+
+class Ex21CompanyOwnership(pa.DataFrameModel):
+    """Define table structure for extracted EX 21 data."""
+
+    _id: Series[str] = pa.Field(alias="id", description="ID of extracted filing.")
+    subsidiary: Series[str] = pa.Field(description="Name of subsidiary company.")
+    loc: Series[str] = pa.Field(
+        description="Location of subsidiary company.", nullable=True
+    )
+    own_per: Series[float] = pa.Field(
+        description="Percent ownership of subsidiary company.",
+        nullable=True,
+        coerce=True,
+    )
+
+
+class Basic10kCompanyInfo(pa.DataFrameModel):
+    """Define table structure for extracted basic 10k data."""
+
+    filename: Index[str] = pa.Field(description="Name of extracted filing.")
+    filer_count: Index[str] = pa.Field(
+        description="Some filings have multiple blocks of company data."
+    )
+    block: Index[str] = pa.Field(description="Block of company data.")
+    block_count: Index[str] = pa.Field(description="Some blocks occur multiple times.")
+    key: Index[str] = pa.Field(description="Key within block.")
+    value: Series[str] = pa.Field(description="Company info fact.")
+
+    class Config:
+        """Provide multi index options in the config."""
+
+        multiindex_name = "time"
+        multiindex_strict = True
+        multiindex_coerce = True
+
+
+class Sec10kExtractionMetadata(pa.DataFrameModel):
+    """Define table structure for extraction metadata."""
+
+    filename: Index[str] = pa.Field(description="Name of extracted filing.")
+    success: Series[bool] = pa.Field(
+        description="Indicates whether filing was successfully extracted.", coerce=True
+    )
+    notes: Series[str] = pa.Field(
+        description="Optional notes about extraction.", nullable=True
+    )
+
+
+ex21_extract_type = pandera_schema_to_dagster_type(Ex21CompanyOwnership)
+basic_10k_extract_type = pandera_schema_to_dagster_type(Basic10kCompanyInfo)
+sec10k_extract_metadata_type = pandera_schema_to_dagster_type(Sec10kExtractionMetadata)

From 35e917d1ca7e498b84ceeeff0b8d688801c8dcf5 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Tue, 10 Sep 2024 16:18:35 -0400
Subject: [PATCH 052/161] Don't cache model, load with io manager

---
 src/mozilla_sec_eia/models/sec10k/__init__.py |  2 +-
 .../models/sec10k/ex_21/__init__.py           | 26 +++++++------------
 2 files changed, 10 insertions(+), 18 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py
index 986cf85..f7f5ba0 100644
--- a/src/mozilla_sec_eia/models/sec10k/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/__init__.py
@@ -46,7 +46,7 @@
 )
 
 ex21_test_job = model_jobs.create_validation_model_job(
-    "ex21_test", [ex_21.test_extraction_metrics, ex_21.layoutlm_local_cache]
+    "ex21_test", [ex_21.test_extraction_metrics]
 )
 
 layoutlm_finetune_job = model_jobs.create_training_job(
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
index 323f115..42b5dc1 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
@@ -223,15 +223,6 @@ def collect_extracted_chunks(
     return metadata_df, extracted_df
 
 
-@asset(
-    io_manager_key="layoutlm_local_io_manager",
-    ins={"layoutlm": AssetIn(input_manager_key="layoutlm_io_manager")},
-)
-def layoutlm_local_cache(layoutlm):
-    """Load pretrained layoutlm from mlflow and save to local path."""
-    return layoutlm
-
-
 @graph_multi_asset(
     outs={
         "ex21_extraction_metadata": AssetOut(
@@ -241,16 +232,17 @@ def layoutlm_local_cache(layoutlm):
             io_manager_key="pandas_parquet_io_manager"
         ),
     },
+    ins={"layoutlm": AssetIn(input_manager_key="layoutlm_io_manager")},
     partitions_def=year_quarter_partitions,
 )
 def ex21_extract(
     sec10k_filing_metadata: pd.DataFrame,
-    layoutlm_local_cache,
+    layoutlm,
 ):
     """Extract ownership info from exhibit 21 docs."""
     filing_chunks = chunk_filings(sec10k_filing_metadata)
     metadata_chunks, extracted_chunks = filing_chunks.map(
-        lambda filings: extract_filing_chunk(filings, layoutlm_local_cache)
+        lambda filings: extract_filing_chunk(filings, layoutlm)
     )
     metadata, extracted = collect_extracted_chunks(
         metadata_chunks.collect(), extracted_chunks.collect()
@@ -269,18 +261,19 @@ def ex21_extract(
             io_manager_key="mlflow_pandas_artifact_io_manager",
             dagster_type=ex21_extract_type,
         ),
-    }
+    },
+    ins={"layoutlm": AssetIn(input_manager_key="layoutlm_io_manager")},
 )
 def ex21_extract_validation(
     ex21_validation_filing_metadata: pd.DataFrame,
     exhibit21_extractor: Exhibit21Extractor,
-    layoutlm_local_cache,
+    layoutlm,
 ):
     """Extract ownership info from exhibit 21 docs."""
     metadata, extracted = exhibit21_extractor.extract_filings(
         ex21_validation_filing_metadata,
-        model=layoutlm_local_cache["model"],
-        processor=layoutlm_local_cache["tokenizer"],
+        model=layoutlm["model"],
+        processor=layoutlm["tokenizer"],
     )
     return metadata, extracted
 
@@ -289,12 +282,11 @@ def ex21_extract_validation(
     cloud_interface=cloud_interface_resource,
 )
 
-production_assets = [sec10k_filing_metadata, ex21_extract, layoutlm_local_cache]
+production_assets = [sec10k_filing_metadata, ex21_extract]
 
 validation_assets = [
     ex21_validation_set,
     ex21_validation_filing_metadata,
     ex21_extract_validation,
     ex21_validation_metrics,
-    layoutlm_local_cache,
 ]

From a7b1c7fe24cea739886b3aa64191c00378ed32fa Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Tue, 10 Sep 2024 16:30:05 -0400
Subject: [PATCH 053/161] Remove float conversion

---
 src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
index 42b5dc1..562f7e1 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
@@ -194,7 +194,6 @@ def extract_filing_chunk(
             }
         ).set_index("filename")
         extracted = Ex21CompanyOwnership.example(size=0)
-    extracted.own_per = extracted.own_per.astype("float64")
     return metadata, extracted
 
 

From f01911757f6ca9675ea250a79529cafaeac54762 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Tue, 10 Sep 2024 16:44:36 -0400
Subject: [PATCH 054/161] Add hypothesis to deps

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index b026c10..92796d1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,6 +28,7 @@ dependencies = [
     "seqeval>=1.2,<2", # Sequence labeling evaluation
     "google-cloud-secret-manager>=2,<3",
     "google-cloud-storage>=2,<3",
+    "hypothesis",
     "matplotlib>=3.8,<4",
     "mlflow>=2.12",
     "opencv-python",

From d7d13d8de55100859b0125036dcf85068b6bdd42 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Tue, 10 Sep 2024 16:45:26 -0400
Subject: [PATCH 055/161] Make own_per str

---
 src/mozilla_sec_eia/models/sec10k/entities.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/entities.py b/src/mozilla_sec_eia/models/sec10k/entities.py
index 6abc4c8..b0f6869 100644
--- a/src/mozilla_sec_eia/models/sec10k/entities.py
+++ b/src/mozilla_sec_eia/models/sec10k/entities.py
@@ -13,7 +13,8 @@ class Ex21CompanyOwnership(pa.DataFrameModel):
     loc: Series[str] = pa.Field(
         description="Location of subsidiary company.", nullable=True
     )
-    own_per: Series[float] = pa.Field(
+    #: Use str to avoid conversion errors
+    own_per: Series[str] = pa.Field(
         description="Percent ownership of subsidiary company.",
         nullable=True,
         coerce=True,

From 70f529371b2c6de3bb80911f0ae3d5b611162ae0 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Tue, 10 Sep 2024 17:01:56 -0400
Subject: [PATCH 056/161] Remove astype

---
 src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
index 562f7e1..38a4295 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
@@ -218,7 +218,6 @@ def collect_extracted_chunks(
     extracted_dfs = [df for df in extracted_dfs if not df.empty]
     metadata_df = pd.concat(metadata_dfs)
     extracted_df = pd.concat(extracted_dfs)
-    extracted_df["own_per"] = extracted_df["own_per"].astype("float64", errors="ignore")
     return metadata_df, extracted_df
 
 

From e4060926e7d4166ce842329db584837643d62c63 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Tue, 10 Sep 2024 17:26:52 -0400
Subject: [PATCH 057/161] Validate ex21 return types

---
 src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
index 38a4295..967c76a 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
@@ -12,6 +12,7 @@
 
 from ..entities import (
     Ex21CompanyOwnership,
+    Sec10kExtractionMetadata,
     ex21_extract_type,
     sec10k_extract_metadata_type,
 )
@@ -218,7 +219,10 @@ def collect_extracted_chunks(
     extracted_dfs = [df for df in extracted_dfs if not df.empty]
     metadata_df = pd.concat(metadata_dfs)
     extracted_df = pd.concat(extracted_dfs)
-    return metadata_df, extracted_df
+    return (
+        Sec10kExtractionMetadata.validate(metadata_df),
+        Ex21CompanyOwnership.validate(extracted_df),
+    )
 
 
 @graph_multi_asset(

From f3835d999ee60d20ab9112899b75f076392b0757 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Tue, 10 Sep 2024 21:09:45 -0400
Subject: [PATCH 058/161] Clean model download temp dir

---
 src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py b/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py
index f1fcb48..b45abd2 100644
--- a/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py
+++ b/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py
@@ -1,5 +1,7 @@
 """Util functions for training and predicting with LayoutLM on Ex. 21 tables."""
 
+import tempfile
+
 import mlflow
 from dagster import ConfigurableResource, InputContext, OutputContext
 from PIL import ImageDraw, ImageFont
@@ -15,7 +17,10 @@ def _load_pretrained_layoutlm(version: str = "latest") -> dict:
     """Function to load layoutlm from mlflow."""
     path = f"models:/layoutlm_extractor/{version}"
 
-    return mlflow.transformers.load_model(path, return_type="pipeline")
+    with tempfile.TemporaryDirectory() as dst_path:
+        return mlflow.transformers.load_model(
+            path, dst_path=dst_path, return_type="pipeline"
+        )
 
 
 class LayoutlmIOManager(MlflowBaseIOManager):

From 3c995cdac21b00a3a00cc477ce8207d04fa7ac7c Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Tue, 10 Sep 2024 21:31:09 -0400
Subject: [PATCH 059/161] Fix model return type

---
 src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py b/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py
index b45abd2..0e24f8d 100644
--- a/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py
+++ b/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py
@@ -19,7 +19,7 @@ def _load_pretrained_layoutlm(version: str = "latest") -> dict:
 
     with tempfile.TemporaryDirectory() as dst_path:
         return mlflow.transformers.load_model(
-            path, dst_path=dst_path, return_type="pipeline"
+            path, dst_path=dst_path, return_type="components"
         )
 
 

From ef55e4bfb9dce01e14d4e2da324f2ebc2e912418 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 11 Sep 2024 09:21:03 -0400
Subject: [PATCH 060/161] Catch errors in creating ex 21 dataset

---
 .../models/sec10k/ex_21/inference.py          | 20 ++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
index 27bd2dc..41f7830 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
@@ -17,6 +17,7 @@
 )
 from transformers.tokenization_utils_base import BatchEncoding
 
+from ..entities import Ex21CompanyOwnership
 from ..utils.cloud import GCSArchive, get_metadata_filename
 from ..utils.layoutlm import (
     get_id_label_conversions,
@@ -285,11 +286,20 @@ def extract_filings(
                 cloud_interface=self.cloud_interface,
                 pdf_dir=pdf_dir,
             )
-            dataset = create_inference_dataset(
-                pdfs_dir=Path(pdf_dir),
-                labeled_json_dir=labeled_json_dir,
-                has_labels=self.has_labels,
-            )
+            try:
+                dataset = create_inference_dataset(
+                    pdfs_dir=Path(pdf_dir),
+                    labeled_json_dir=labeled_json_dir,
+                    has_labels=self.has_labels,
+                )
+            # TODO: Investigate failures in creating dataset
+            except KeyError:
+                logger.warning("Failed to create inference dataset!")
+                extraction_metadata.loc[:, "filename"] = False
+                extraction_metadata.loc[:, "notes"] = (
+                    "Failed to create inference dataset."
+                )
+                return extraction_metadata, Ex21CompanyOwnership.example(size=0)
         if self.dataset_ind:
             dataset = dataset.select(self.dataset_ind)
 

From b37450ae08a018cb2ad3c6e370834aa066374d4e Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 11 Sep 2024 11:21:49 -0400
Subject: [PATCH 061/161] Fix column name

---
 src/mozilla_sec_eia/models/sec10k/ex_21/inference.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
index 41f7830..274ff29 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
@@ -295,7 +295,7 @@ def extract_filings(
             # TODO: Investigate failures in creating dataset
             except KeyError:
                 logger.warning("Failed to create inference dataset!")
-                extraction_metadata.loc[:, "filename"] = False
+                extraction_metadata.loc[:, "success"] = False
                 extraction_metadata.loc[:, "notes"] = (
                     "Failed to create inference dataset."
                 )

From 06b18ed4eb5a17660d1f3877f8226f71c6c3b001 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 11 Sep 2024 20:15:54 -0400
Subject: [PATCH 062/161] Try to catch empty pdf errors

---
 src/mozilla_sec_eia/models/sec10k/ex_21/inference.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
index 274ff29..5833de1 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
@@ -60,7 +60,10 @@ def format_unlabeled_pdf_dataframe(pdfs_dir: Path):
             continue
         src_path = pdfs_dir / pdf_filename
         filename = Path(pdf_filename).stem
-        extracted, pg = get_pdf_data_from_path(src_path)
+        try:
+            extracted, pg = get_pdf_data_from_path(src_path)
+        except RuntimeError:
+            continue
         txt = extracted["pdf_text"]
         pg_meta = extracted["page"]
         # normalize bboxes between 0 and 1000 for Hugging Face

From abfc006f0f3da4e0db606c448bcb3f08c46d0b18 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 11 Sep 2024 20:56:55 -0400
Subject: [PATCH 063/161] Print traceback in caught exception

---
 src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
index 967c76a..5f7590c 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
@@ -1,6 +1,7 @@
 """Module for working with exhibit 21 data."""
 
 import logging
+import traceback
 
 import mlflow
 import pandas as pd
@@ -183,10 +184,9 @@ def extract_filing_chunk(
             model=layoutlm["model"],
             processor=layoutlm["tokenizer"],
         )
-    except (torch.OutOfMemoryError, RuntimeError) as e:
-        logging.warning(
-            f"Error {str(e)} while extracting filings: {filings['filename']}"
-        )
+    except (torch.OutOfMemoryError, RuntimeError):
+        logger.warning(traceback.format_exc())
+        logger.warning(f"Error while extracting filings: {filings['filename']}")
         metadata = pd.DataFrame(
             {
                 "filename": filings["filename"],

From ff92a55e99960362a8836542465c4a7e0971314a Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 11 Sep 2024 21:03:59 -0400
Subject: [PATCH 064/161] Fix empty pdf check

---
 .../models/sec10k/ex_21/create_labeled_dataset.py         | 5 +++++
 src/mozilla_sec_eia/models/sec10k/ex_21/inference.py      | 8 ++------
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py b/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py
index 55e1d5a..63be643 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py
@@ -202,6 +202,11 @@ def get_image_dict(pdfs_dir):
         if pdf_filename.split(".")[-1] != "pdf":
             continue
         pdf_file_path = pdfs_dir / pdf_filename
+
+        # Check for empty file
+        if pdf_file_path.stat().st_size == 0:
+            continue
+
         _, pg = get_pdf_data_from_path(pdf_file_path)
         full_pg_img = render_page(pg)
         filename = pdf_filename.split(".")[0]
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
index 5833de1..080b072 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
@@ -60,10 +60,7 @@ def format_unlabeled_pdf_dataframe(pdfs_dir: Path):
             continue
         src_path = pdfs_dir / pdf_filename
         filename = Path(pdf_filename).stem
-        try:
-            extracted, pg = get_pdf_data_from_path(src_path)
-        except RuntimeError:
-            continue
+        extracted, pg = get_pdf_data_from_path(src_path)
         txt = extracted["pdf_text"]
         pg_meta = extracted["page"]
         # normalize bboxes between 0 and 1000 for Hugging Face
@@ -82,9 +79,8 @@ def create_inference_dataset(pdfs_dir: Path, labeled_json_dir=None, has_labels=F
     else:
         inference_df = format_unlabeled_pdf_dataframe(pdfs_dir=pdfs_dir)
     image_dict = get_image_dict(pdfs_dir)
-    doc_filenames = inference_df["id"].unique()
     annotations = []
-    for filename in doc_filenames:
+    for filename in image_dict:
         annotation = {
             "id": filename,
             "tokens": inference_df.groupby("id")["text"].apply(list).loc[filename],

From 8aa8c9544f54c2c5fb822dca6cf4b171f490718b Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 11 Sep 2024 21:16:34 -0400
Subject: [PATCH 065/161] Actually fix empty pdf check?

---
 .../models/sec10k/ex_21/create_labeled_dataset.py          | 5 -----
 src/mozilla_sec_eia/models/sec10k/ex_21/inference.py       | 7 +++++++
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py b/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py
index 63be643..55e1d5a 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py
@@ -202,11 +202,6 @@ def get_image_dict(pdfs_dir):
         if pdf_filename.split(".")[-1] != "pdf":
             continue
         pdf_file_path = pdfs_dir / pdf_filename
-
-        # Check for empty file
-        if pdf_file_path.stat().st_size == 0:
-            continue
-
         _, pg = get_pdf_data_from_path(pdf_file_path)
         full_pg_img = render_page(pg)
         filename = pdf_filename.split(".")[0]
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
index 080b072..ff4be90 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
@@ -210,6 +210,13 @@ def _cache_pdfs(
         except Exception as e:
             extraction_metadata.loc[filing.filename, ["success"]] = False
             extraction_metadata.loc[filing.filename, ["note"]] = str(e)
+
+        # Some pdfs are empty. Check for these and remove from dir
+        if pdf_path.stat().st_size == 0:
+            extraction_metadata.loc[filing.filename, ["success"]] = False
+            extraction_metadata.loc[filing.filename, ["note"]] = "PDF empty"
+            pdf_path.unlink()
+
     return extraction_metadata
 
 

From 43600bc2ab128a4b0cff3032877fc9094d4c799f Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Tue, 17 Sep 2024 21:26:24 -0400
Subject: [PATCH 066/161] Use UPath in GCSArchive

---
 .../models/sec10k/basic_10k.py                |  12 +-
 .../models/sec10k/ex_21/__init__.py           |   2 +-
 .../models/sec10k/utils/cloud.py              | 201 +++++-------------
 tests/unit/models/sec10k/utils_test.py        |  88 ++++----
 4 files changed, 105 insertions(+), 198 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/basic_10k.py b/src/mozilla_sec_eia/models/sec10k/basic_10k.py
index 22b7a2a..a1497b1 100644
--- a/src/mozilla_sec_eia/models/sec10k/basic_10k.py
+++ b/src/mozilla_sec_eia/models/sec10k/basic_10k.py
@@ -82,15 +82,19 @@ def extract_filings(
     logger.info(f"Extracting {len(filings_to_extract)} filings.")
 
     extraction_metadata = pd.DataFrame(
-        {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)}
+        {
+            "filename": pd.Series(dtype=str),
+            "success": pd.Series(dtype=bool),
+            "notes": pd.Series(dtype=str),
+        }
     ).set_index("filename")
     extracted = pd.DataFrame()
 
     for filing in cloud_interface.iterate_filings(filings_to_extract):
         ext, filename, unmatched_keys = _extract_10k(filing)
-        extraction_metadata.loc[filename, ["success", "unmatched_keys"]] = [
+        extraction_metadata.loc[filename, ["success", "notes"]] = [
             len(ext) > 0,
-            ",".join(unmatched_keys),
+            "Unmatched Keys: " + ",".join(unmatched_keys),
         ]
         extracted = pd.concat([extracted, ext])
 
@@ -134,7 +138,7 @@ def basic_10k_validation_filing_metadata(
     """Get sec 10k filing metadata from validation set."""
     filing_metadata = cloud_interface.get_metadata()
     return filing_metadata[
-        filing_metadata["filename"].isin(
+        filing_metadata.index.isin(
             basic_10k_validation_set.index.get_level_values("filename").unique()
         )
     ]
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
index 5f7590c..890f44e 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
@@ -40,7 +40,7 @@ def ex21_validation_filing_metadata(
     """Get sec 10k filing metadata from validation set."""
     filing_metadata = cloud_interface.get_metadata()
     return filing_metadata[
-        filing_metadata["filename"].isin(ex21_validation_set["filename"].unique())
+        filing_metadata.index.isin(ex21_validation_set["filename"].unique())
     ]
 
 
diff --git a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py
index 5346ec6..930f44f 100644
--- a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py
+++ b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py
@@ -4,25 +4,18 @@
 import io
 import logging
 import re
-from contextlib import contextmanager
 from hashlib import md5
 from pathlib import Path
 from typing import BinaryIO, TextIO
 
 import fitz
 import pandas as pd
-import pg8000
-from dagster import ConfigurableResource, EnvVar
-from google.cloud import storage
-from google.cloud.sql.connector import Connector
+from dagster import ConfigurableResource
 from PIL import Image
-from pydantic import BaseModel, PrivateAttr
-from sqlalchemy import Engine, create_engine, select
-from sqlalchemy.orm import Session
+from pydantic import BaseModel
+from upath import UPath
 from xhtml2pdf import pisa
 
-from .db_metadata import Base, Sec10kMetadata
-
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
 
@@ -142,88 +135,36 @@ def from_file(
 
 
 class GCSArchive(ConfigurableResource):
-    """Provides an interface for archived filings on GCS.
-
-    This class looks for several environment variables to configure
-    access to cloud resources. These can be set directly, or be in a
-    .env file at the top level.
-
-    The following variables need to be set:
-
-    GCS_FILINGS_BUCKET_NAME: Name of bucket where 10k filings are stored.
-    GCS_LABELS_BUCKET_NAME: Name of top-level bucket where labelled training data is stored.
-    GCS_METADATA_DB_INSTANCE_CONNECTION: instance connection string
-    in the form 'project:region:instance'.
-    GCS_IAM_USER: Email of user of service account trying to connect.
-    GCS_METADATA_DB_NAME: Name of DB in instance to connect to.
-    GCS_PROJECT: Name of google cloud project.
-    MLFLOW_TRACKING_URI: URI of mlflow tracking server.
-    """
-
-    filings_bucket_name: str
-    labels_bucket_name: str
-    metadata_db_instance_connection: str
-    user: str
-    metadata_db_name: str
-    project: str
-
-    _filings_bucket = PrivateAttr()
-    _labels_bucket = PrivateAttr()
-    _engine = PrivateAttr()
-
-    def setup_for_execution(self, context):
-        """Initialize interface to filings archive on GCS."""
-        self._engine = self._get_engine()
-        self._filings_bucket = self._get_bucket(self.filings_bucket_name)
-        self._labels_bucket = self._get_bucket(self.labels_bucket_name)
-
-        Base.metadata.create_all(self._engine)
-
-    def _get_bucket(self, bucket_name):
-        """Return cloud storage bucket where SEC10k filings are archived."""
-        storage_client = storage.Client()
-        return storage_client.bucket(bucket_name)
-
-    def _get_engine(self) -> Engine:
-        """Initialize a connection pool for a Cloud SQL instance of Postgres.
-
-        Uses the Cloud SQL Python Connector with Automatic IAM Database Authentication.
-        """
-        # initialize Cloud SQL Python Connector object
-        connector = Connector()
-
-        def getconn() -> pg8000.dbapi.Connection:
-            conn: pg8000.dbapi.Connection = connector.connect(
-                self.metadata_db_instance_connection,
-                "pg8000",
-                user=self.user,
-                db=self.metadata_db_name,
-                enable_iam_auth=True,
-            )
-            return conn
+    """Provides an interface for archived filings on GCS."""
 
-        return create_engine(
-            "postgresql+pg8000://",
-            creator=getconn,
-        )
+    filings_bucket: str = "gs://2de2b9f52c99a240-bucket-sec-10ks/"
+    labels_bucket: str = "gs://labeled-ex21-filings/"
+    outputs_bucket: str = "gs://sec10k-outputs/"
+
+    @property
+    def filings_bucket_path(self):
+        """Return UPath of filings bucket."""
+        return UPath(self.filings_bucket)
 
-    @contextmanager
-    def create_session(self) -> Session:
-        """Yield sqlalchemy session."""
-        with Session(self._engine) as session:
-            yield session
+    @property
+    def labels_bucket_path(self):
+        """Return UPath of labels bucket."""
+        return UPath(self.labels_bucket)
+
+    @property
+    def outputs_bucket_path(self):
+        """Return UPath of outputs bucket."""
+        return UPath(self.outputs_bucket)
 
     def get_metadata(self, year_quarter: str | None = None) -> pd:
         """Return dataframe of filing metadata."""
-        selection = select(Sec10kMetadata)
+        selection = None
         if year_quarter is not None:
-            selection = selection.where(Sec10kMetadata.year_quarter == year_quarter)
-
-        return pd.read_sql(selection, self._engine)
+            selection = ["year_quarter", "==", year_quarter]
 
-    def get_filing_blob(self, year_quarter: str, path: str) -> storage.Blob:
-        """Return Blob pointing to file in GCS bucket."""
-        return self._filings_bucket.blob(f"sec10k/sec10k-{year_quarter}/{path}")
+        return pd.read_parquet(
+            self.outputs_bucket_path / "sec10k_filing_metadata", filters=selection
+        )
 
     def get_local_filename(
         self, cache_directory: Path, filing: pd.Series | Sec10K, extension=".html"
@@ -239,29 +180,6 @@ def get_local_filename(
             )
         )
 
-    def cache_blob(
-        self,
-        blob: storage.Blob,
-        local_path: Path,
-    ) -> Path:
-        """Cache a single filing in cache_directory and return path."""
-        # Create cache directory
-        local_path.parent.mkdir(parents=True, exist_ok=True)
-
-        if exists := local_path.exists():
-            blob.update()
-            local_hash = _compute_md5(local_path)
-            remote_hash = blob.md5_hash
-            refresh = remote_hash != local_hash
-
-        if (not exists) or refresh:
-            logger.info(f"Downloading to {local_path}")
-            blob.download_to_filename(local_path)
-        else:
-            logger.info(f"{local_path} is already cached")
-
-        return local_path
-
     def get_filings(
         self,
         filing_selection: pd.DataFrame,
@@ -278,11 +196,12 @@ def get_filings(
         """
         filings = []
         for _, filing in filing_selection.iterrows():
-            blob = self.get_filing_blob(filing["year_quarter"], filing["filename"])
             local_path = self.get_local_filename(cache_directory, filing)
-            filing_path = self.cache_blob(blob, local_path)
+            if not local_path.exists():
+                with local_path.open("w") as f:
+                    f.write((self.filings_bucket_path / filing.filename).read_text())
 
-            with filing_path.open() as f:
+            with local_path.open() as f:
                 sec10k_filing = Sec10K.from_file(
                     file=f,
                     filename=filing["filename"],
@@ -317,14 +236,11 @@ def iterate_filings(
             filing_selection: Pandas dataframe with same schema as metadata df where each row
                 is a filing to return.
         """
-        for _, filing in filing_selection.iterrows():
+        for filename, filing in filing_selection.iterrows():
+            filepath = f"sec10k/sec10k-{filing.year_quarter}/{filename}"
             yield Sec10K.from_file(
-                file=io.StringIO(
-                    self.get_filing_blob(
-                        filing["year_quarter"], filing["filename"]
-                    ).download_as_text()
-                ),
-                filename=filing["filename"],
+                file=io.StringIO((self.filings_bucket_path / filepath).read_text()),
+                filename=filename,
                 cik=filing["cik"],
                 year_quarter=filing["year_quarter"],
                 ex_21_version=filing["exhibit_21_version"],
@@ -334,7 +250,7 @@ def cache_training_data(
         self,
         json_cache_path: Path,
         pdf_cache_path: Path,
-        gcs_folder_name: str = "labeled/",
+        gcs_folder_name: str = "labeledv0.2",
         overwrite_pdfs: bool = False,
     ):
         """Cache labeled training data stored on GCS for local use."""
@@ -342,36 +258,34 @@ def cache_training_data(
         pdf_cache_path.mkdir(parents=True, exist_ok=True)
         metadata_df = self.get_metadata()
         label_name_pattern = re.compile(r"(\d+)-\d{4}q[1-4]-\d+-(.+)")
-        if gcs_folder_name[-1] != "/":
-            gcs_folder_name += "/"
-        for blob in self._labels_bucket.list_blobs(match_glob=f"{gcs_folder_name}*"):
-            if blob.name == gcs_folder_name:
-                continue
 
+        # Cache filings and labels
+        filenames = []
+        direc = self.labels_bucket_path / gcs_folder_name
+        for file in direc.iterdir():
+            if file.name == gcs_folder_name:
+                continue
             # Cache labels
-            self.cache_blob(
-                blob, json_cache_path / blob.name.replace(gcs_folder_name, "")
-            )
+            with (json_cache_path / file.name).open("w") as f:
+                f.write(file.read_text())
 
             # Cache filing
-            match = label_name_pattern.search(blob.name)
-            filename = f"edgar/data/{match.group(1)}/{match.group(2)}.txt"
-            filing_metadata = metadata_df[metadata_df["filename"] == filename]
-            filing = self.get_filings(filing_metadata)[0]
-            pdf_path = self.get_local_filename(
-                pdf_cache_path, filing_metadata.iloc[0], extension=".pdf"
-            )
-            if not pdf_path.exists() or overwrite_pdfs:
-                with pdf_path.open("wb") as f:
-                    filing.ex_21.save_as_pdf(f)
+            match = label_name_pattern.search(file.name)
+            filenames.append(f"edgar/data/{match.group(1)}/{match.group(2)}.txt")
+
+        filings = metadata_df[metadata_df["filename"].isin(filenames)]
+        self.get_filings(
+            filings,
+            cache_path=pdf_cache_path,
+            cache_pdf=True,
+        )
 
     def validate_archive(self) -> bool:
         """Validate that all filings described in metadata table exist in GCS bucket."""
         # Get files in archive
         logger.info("Get list of files in archive.")
         archive_filenames = {
-            re.sub(r"sec10k/sec10k-\d{4}q\d/", "", blob.name)
-            for blob in self._filings_bucket.list_blobs()
+            filing.name for filing in self.filings_bucket_path.iterdir()
         }
 
         # Get metadata df
@@ -400,11 +314,4 @@ def get_metadata_filename(local_filename: str):
     return "edgar/data/" + local_filename.replace("-", "/", 1) + ".txt"
 
 
-cloud_interface_resource = GCSArchive(
-    filings_bucket_name=EnvVar("GCS_FILINGS_BUCKET_NAME"),
-    labels_bucket_name=EnvVar("GCS_LABELS_BUCKET_NAME"),
-    metadata_db_instance_connection=EnvVar("GCS_METADATA_DB_INSTANCE_CONNECTION"),
-    user=EnvVar("GCS_IAM_USER"),
-    metadata_db_name=EnvVar("GCS_METADATA_DB_NAME"),
-    project=EnvVar("GCS_PROJECT"),
-)
+cloud_interface_resource = GCSArchive()
diff --git a/tests/unit/models/sec10k/utils_test.py b/tests/unit/models/sec10k/utils_test.py
index de9fbd7..b31fc63 100644
--- a/tests/unit/models/sec10k/utils_test.py
+++ b/tests/unit/models/sec10k/utils_test.py
@@ -3,6 +3,7 @@
 import io
 import unittest
 from dataclasses import dataclass
+from pathlib import Path
 
 import pandas as pd
 import pytest
@@ -16,40 +17,29 @@
 @pytest.fixture
 def test_archive():
     """Return test GCSArchive class."""
-    with (
-        unittest.mock.patch(
-            "mozilla_sec_eia.models.sec10k.utils.cloud.GCSArchive._get_engine"
-        ),
-        unittest.mock.patch(
-            "mozilla_sec_eia.models.sec10k.utils.cloud.GCSArchive._get_bucket"
-        ),
-    ):
-        archive = GCSArchive(
-            filings_bucket_name="filings_bucket_name",
-            labels_bucket_name="labels_bucket_name",
-            metadata_db_instance_connection="metadata_db_instance_connection",
-            user="user",
-            metadata_db_name="metadata_db_name",
-            project="project_name",
-        )
-        archive.setup_for_execution("fake_context")
-        return archive
+    return GCSArchive()
 
 
 @dataclass
-class _FakeBlob:
-    name: str
+class _FakePath:
+    files: list[str]
+
+    def iterdir(self):
+        """Fake iterdir"""
+        yield from self.files
 
 
 @pytest.mark.parametrize(
     "archive_files,metadata_files,valid",
     [
         (
-            [
-                _FakeBlob("sec10k/sec10k-1993q1/filing1.txt"),
-                _FakeBlob("sec10k/sec10k-1996q2/filing2.txt"),
-                _FakeBlob("sec10k/sec10k-2000q4/filing3.txt"),
-            ],
+            _FakePath(
+                files=[
+                    Path("sec10k/sec10k-1993q1/filing1.txt"),
+                    Path("sec10k/sec10k-1996q2/filing2.txt"),
+                    Path("sec10k/sec10k-2000q4/filing3.txt"),
+                ]
+            ),
             [
                 "filing1.txt",
                 "filing2.txt",
@@ -58,12 +48,14 @@ class _FakeBlob:
             True,
         ),
         (
-            [
-                _FakeBlob("sec10k/sec10k-1993q1/filing1.txt"),
-                _FakeBlob("sec10k/sec10k-1996q2/filing2.txt"),
-                _FakeBlob("sec10k/sec10k-2000q4/filing3.txt"),
-                _FakeBlob("sec10k/sec10k-2001q3/filing4.txt"),
-            ],
+            _FakePath(
+                files=[
+                    Path("sec10k/sec10k-1993q1/filing1.txt"),
+                    Path("sec10k/sec10k-1996q2/filing2.txt"),
+                    Path("sec10k/sec10k-2000q4/filing3.txt"),
+                    Path("sec10k/sec10k-2001q3/filing4.txt"),
+                ]
+            ),
             [
                 "filing1.txt",
                 "filing2.txt",
@@ -72,11 +64,13 @@ class _FakeBlob:
             False,
         ),
         (
-            [
-                _FakeBlob("sec10k/sec10k-1993q1/filing1.txt"),
-                _FakeBlob("sec10k/sec10k-1996q2/filing2.txt"),
-                _FakeBlob("sec10k/sec10k-2000q4/filing3.txt"),
-            ],
+            _FakePath(
+                files=[
+                    Path("sec10k/sec10k-1993q1/filing1.txt"),
+                    Path("sec10k/sec10k-1996q2/filing2.txt"),
+                    Path("sec10k/sec10k-2000q4/filing3.txt"),
+                ]
+            ),
             [
                 "filing1.txt",
                 "filing2.txt",
@@ -89,17 +83,19 @@ class _FakeBlob:
 )
 def test_validate_archive(test_archive, archive_files, metadata_files, valid, mocker):
     """Test archive validation functionality."""
-    test_archive._filings_bucket.list_blobs.return_value = archive_files
-
-    metadata_mock = mocker.MagicMock(
-        return_value=pd.DataFrame({"filename": metadata_files})
-    )
-    mocker.patch(
-        "mozilla_sec_eia.models.sec10k.utils.cloud.GCSArchive.get_metadata",
-        new=metadata_mock,
-    )
+    with unittest.mock.patch(
+        "mozilla_sec_eia.models.sec10k.utils.GCSArchive.filings_bucket_path",
+        new=archive_files,
+    ):
+        metadata_mock = mocker.MagicMock(
+            return_value=pd.DataFrame({"filename": metadata_files})
+        )
+        mocker.patch(
+            "mozilla_sec_eia.models.sec10k.utils.cloud.GCSArchive.get_metadata",
+            new=metadata_mock,
+        )
 
-    assert test_archive.validate_archive() == valid
+        assert test_archive.validate_archive() == valid
 
 
 @pytest.mark.parametrize(

From 05ad82cd97b5e6aee76dec738a79bce303cc2f5f Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 18 Sep 2024 10:34:10 -0400
Subject: [PATCH 067/161] Make _configure_mlflow a standalone function

---
 .../library/mlflow/mlflow_resource.py         | 60 ++++++++++---------
 1 file changed, 32 insertions(+), 28 deletions(-)

diff --git a/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py b/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py
index 1060b9b..82710de 100644
--- a/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py
+++ b/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py
@@ -21,6 +21,37 @@
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
 
+def _configure_mlflow(tracking_uri: str, project: str):
+    """Do runtime configuration of mlflow."""
+    os.environ["MLFLOW_TRACKING_USERNAME"] = "admin"
+    os.environ["MLFLOW_TRACKING_PASSWORD"] = _get_tracking_password(
+        tracking_uri, project
+    )
+    os.environ["MLFLOW_TRACKING_URI"] = tracking_uri
+    os.environ["MLFLOW_GCS_DOWNLOAD_CHUNK_SIZE"] = "20971520"
+    os.environ["MLFLOW_GCS_UPLOAD_CHUNK_SIZE"] = "20971520"
+    os.environ["MLFLOW_HTTP_REQUEST_TIMEOUT"] = "900"
+    os.environ["MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING"] = "true"
+
+
+def _get_tracking_password(tracking_uri: str, project: str, version_id: str = "latest"):
+    """Get tracking server password from gcloud secrets."""
+    # Password not required for local use
+    if "sqlite" not in tracking_uri:
+        # Create the Secret Manager client.
+        client = secretmanager.SecretManagerServiceClient()
+
+        # Build the resource name of the secret version.
+        name = f"projects/{project}/secrets/mlflow_admin_password/versions/{version_id}"
+
+        # Access the secret version.
+        response = client.access_secret_version(name=name)
+
+        # Return the decoded payload.
+        return response.payload.data.decode("UTF-8")
+    return ""
+
+
 class MlflowInterface(ConfigurableResource):
     """Dagster resource to interface with mlflow tracking server.
 
@@ -52,7 +83,7 @@ def yield_for_execution(
         """Create experiment tracker for specified experiment."""
         dagster_run_id = context.run_id
         self._mlflow_run_id = None
-        self._configure_mlflow()
+        _configure_mlflow(self.tracking_uri, self.project)
 
         if self.tracking_enabled:
             # Get run_id associated with current dagster run
@@ -75,33 +106,6 @@ def mlflow_run_id(self) -> str | None:
         """Return run id of current run."""
         return self._mlflow_run_id
 
-    def _get_tracking_password(self, version_id: str = "latest"):
-        """Get tracking server password from gcloud secrets."""
-        # Password not required for local use
-        if "sqlite" not in self.tracking_uri:
-            # Create the Secret Manager client.
-            client = secretmanager.SecretManagerServiceClient()
-
-            # Build the resource name of the secret version.
-            name = f"projects/{self.project}/secrets/mlflow_admin_password/versions/{version_id}"
-
-            # Access the secret version.
-            response = client.access_secret_version(name=name)
-
-            # Return the decoded payload.
-            return response.payload.data.decode("UTF-8")
-        return ""
-
-    def _configure_mlflow(self):
-        """Do runtime configuration of mlflow."""
-        os.environ["MLFLOW_TRACKING_USERNAME"] = "admin"
-        os.environ["MLFLOW_TRACKING_PASSWORD"] = self._get_tracking_password()
-        os.environ["MLFLOW_TRACKING_URI"] = self.tracking_uri
-        os.environ["MLFLOW_GCS_DOWNLOAD_CHUNK_SIZE"] = "20971520"
-        os.environ["MLFLOW_GCS_UPLOAD_CHUNK_SIZE"] = "20971520"
-        os.environ["MLFLOW_HTTP_REQUEST_TIMEOUT"] = "900"
-        os.environ["MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING"] = "true"
-
     @staticmethod
     def get_or_create_experiment(
         experiment_name: str, artifact_location: str = ""

From 99fc7edac96e8d8db5f4ebe934bcef6bbce5cd67 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 18 Sep 2024 12:39:59 -0400
Subject: [PATCH 068/161] Try to skip notebooks in ruff check

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 92796d1..dd39186 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -155,6 +155,7 @@ doctest_optionflags = [
 ]
 
 [tool.ruff]
+exclude = ["notebooks/*"]
 select = [
     "A", # flake8-builtins
     # "ARG", # unused arguments

From b13550052c39112ff3dd88cc2569978a3d5aef1e Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Thu, 19 Sep 2024 10:43:57 -0400
Subject: [PATCH 069/161] Pull integration test fixes from main

---
 tests/conftest.py                             | 44 +++---------------
 .../integration/models/sec10k/extract_test.py | 46 +++++++++++++------
 2 files changed, 39 insertions(+), 51 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 87e44e5..d1a47d4 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,13 +1,11 @@
 """PyTest configuration module. Defines useful fixtures, command line args."""
 
 import logging
+import os
 from pathlib import Path
 
-import mlflow
 import pytest
 
-from mozilla_sec_eia.library.mlflow import MlflowInterface
-
 logger = logging.getLogger(__name__)
 
 
@@ -38,41 +36,11 @@ def test_dir() -> Path:
     return Path(__file__).parent
 
 
-class TestTracker(MlflowInterface):
-    """Create sub-class of `MlflowInterface` to use in testing context.
-
-    Test class creates an in-memory sqlite db for tracking, and a temporary directory
-    for artifact storage.
-    """
-
-    def _get_tracking_password(self):
-        return "password"
-
-
 @pytest.fixture
-def test_tracker_factory(tmp_path):
-    def factory(experiment_name: str) -> TestTracker:
-        return TestTracker(
-            artifact_location=str(tmp_path),
-            tracking_uri="sqlite:///:memory:",
-            experiment_name=experiment_name,
-            project="",
-        )
+def set_test_mlflow_env_vars_factory():
+    def factory():
+        # Use in memory tracking backend unless USE_TRACKING_SERVER is set
+        if not os.getenv("USE_TRACKING_SERVER"):
+            os.environ["MLFLOW_TRACKING_URI"] = "sqlite:///:memory:"
 
     return factory
-
-
-@pytest.fixture
-def get_most_recent_mlflow_run_factory():
-    def _get_run(experiment_name: str):
-        """Search mlflow for most recent run with specified experiment name."""
-        run_metadata = mlflow.search_runs(
-            experiment_names=[experiment_name],
-        )
-
-        # Mlflow returns runs ordered by their runtime, so it's easy to grab the latest run
-        # This assert will ensure this doesn't silently break if the ordering changes
-        assert run_metadata.loc[0, "end_time"] == run_metadata["end_time"].max()
-        return mlflow.get_run(run_metadata.loc[0, "run_id"])
-
-    return _get_run
diff --git a/tests/integration/models/sec10k/extract_test.py b/tests/integration/models/sec10k/extract_test.py
index 1e2d6a8..26f7fd6 100644
--- a/tests/integration/models/sec10k/extract_test.py
+++ b/tests/integration/models/sec10k/extract_test.py
@@ -1,39 +1,59 @@
 """Validate basic 10k and exhibit 21 extraction."""
 
 import logging
+import os
+import unittest
 
 import dotenv
-import pytest
 
+from mozilla_sec_eia.library.mlflow.mlflow_resource import (
+    _configure_mlflow,
+    get_most_recent_run,
+)
 from mozilla_sec_eia.models import sec10k
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
+# TODO: Make validation tests log to tracking server on merge to main
+
 
 def test_basic_10k_validation(
-    test_tracker_factory,
-    get_most_recent_mlflow_run_factory,
+    set_test_mlflow_env_vars_factory,
 ):
     """Test basic_10k_validation_job."""
-    dotenv.load_dotenv()
-    sec10k.defs.get_job_def("basic_10k_extraction_validation").execute_in_process()
+    dotenv.load_dotenv(override=True)
+    set_test_mlflow_env_vars_factory()
+    result = sec10k.defs.get_job_def(
+        "basic_10k_extraction_validation"
+    ).execute_in_process()
 
-    run = get_most_recent_mlflow_run_factory("basic_10k_extraction_validation")
+    run = get_most_recent_run("basic_10k_extraction_validation", result.run_id)
 
     assert run.data.metrics["precision"] == 1
     assert run.data.metrics["recall"] == 1
 
 
-@pytest.mark.xfail
 def test_ex21_validation(
-    test_tracker_factory,
-    get_most_recent_mlflow_run_factory,
+    set_test_mlflow_env_vars_factory,
 ):
     """Test ex21_validation_job."""
-    dotenv.load_dotenv()
-    sec10k.defs.get_job_def("ex21_extraction_validation").execute_in_process()
-
-    run = get_most_recent_mlflow_run_factory("ex21_extraction_validation")
+    dotenv.load_dotenv(override=True)
+    _configure_mlflow(
+        os.getenv("MLFLOW_TRACKING_URI"),
+        os.getenv("GCS_PROJECT"),
+    )
+    pretrained_model = sec10k.utils.layoutlm._load_pretrained_layoutlm()
+
+    with unittest.mock.patch(
+        "mozilla_sec_eia.models.sec10k.utils.layoutlm._load_pretrained_layoutlm",
+        new=lambda _: pretrained_model,
+    ):
+        set_test_mlflow_env_vars_factory()
+        result = sec10k.defs.get_job_def(
+            "ex21_extraction_validation"
+        ).execute_in_process()
+
+    run = get_most_recent_run("ex21_extraction_validation", result.run_id)
 
     assert run.data.metrics["avg_subsidiary_jaccard_sim"] > 0.85
     assert run.data.metrics["avg_location_jaccard_sim"] > 0.9

From 6e868f22e7f081d83e6c80fdb06f1680cc8b13b5 Mon Sep 17 00:00:00 2001
From: Zach Schira <zach.schira@catalyst.coop>
Date: Thu, 19 Sep 2024 12:00:34 -0400
Subject: [PATCH 070/161] Fix typos in README.rst

---
 README.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.rst b/README.rst
index af06075..948010c 100644
--- a/README.rst
+++ b/README.rst
@@ -75,14 +75,14 @@ use the appropriate executor and supply the job with necessary resources.
 Library
 ^^^^^^^
 There's generic shared tooling for ``pudl-models`` defined in
-``src/mozilla_sec_eia/library/``. This includes the helper fucntions for
+``src/mozilla_sec_eia/library/``. This includes the helper functions for
 constructing dagster jobs discussed above, as well as useful methods for computing
 validation metrics, and an interface to our mlflow tracking server integrated with
 our tracking server.
 
 MlFlow
 """"""
-We use a remote `mlflow tracking <https://mlflow.org/docs/latest/tracking.html>`__ to aide in the
+We use a remote `mlflow tracking <https://mlflow.org/docs/latest/tracking.html>`__ to aid in the
 development and management of ``pudl-models``. In the ``mlflow`` module, there are
 several dagster resources and IO-managers that can be used in any models to allow simple
 seamless interface to the server.

From df4fd095e023cfa7644ea9c19f09777cf3912bc2 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Thu, 19 Sep 2024 14:45:07 -0400
Subject: [PATCH 071/161] Cache downloaded layoutlm in dagster home

---
 .../library/mlflow/mlflow_resource.py         |  7 ++++
 .../models/sec10k/utils/layoutlm.py           | 41 +++++++------------
 2 files changed, 21 insertions(+), 27 deletions(-)

diff --git a/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py b/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py
index e3428d7..d0fa62b 100644
--- a/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py
+++ b/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py
@@ -12,6 +12,7 @@
 import logging
 import os
 from contextlib import contextmanager
+from pathlib import Path
 
 import mlflow
 from dagster import ConfigurableResource, EnvVar, InitResourceContext
@@ -72,9 +73,15 @@ class MlflowInterface(ConfigurableResource):
     experiment_name: str
     tags: dict = {}
     project: str = EnvVar("GCS_PROJECT")
+    dagster_home: str = EnvVar("DAGSTER_HOME")
 
     _mlflow_run_id: str = PrivateAttr()
 
+    @property
+    def dagster_home_path(self):
+        """Return `dagster_home` as a Path."""
+        return Path(self.dagster_home)
+
     @contextmanager
     def yield_for_execution(
         self,
diff --git a/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py b/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py
index d358e33..6efc310 100644
--- a/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py
+++ b/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py
@@ -1,26 +1,22 @@
 """Util functions for training and predicting with LayoutLM on Ex. 21 tables."""
 
-import tempfile
-
 import mlflow
-from dagster import ConfigurableResource, InputContext, OutputContext
+from dagster import InputContext, OutputContext
 from PIL import ImageDraw, ImageFont
-from pydantic import PrivateAttr
 from transformers import (
     Trainer,
 )
 
-from mozilla_sec_eia.library.mlflow import MlflowBaseIOManager, MlflowInterface
+from mozilla_sec_eia.library.mlflow import MlflowBaseIOManager
 
 
-def _load_pretrained_layoutlm(version: str = "latest") -> dict:
+def _load_pretrained_layoutlm(cache_path: str, version: str = "latest") -> dict:
     """Function to load layoutlm from mlflow."""
     path = f"models:/layoutlm_extractor/{version}"
 
-    with tempfile.TemporaryDirectory() as dst_path:
-        return mlflow.transformers.load_model(
-            path, dst_path=dst_path, return_type="components"
-        )
+    return mlflow.transformers.load_model(
+        path, dst_path=cache_path, return_type="components"
+    )
 
 
 class LayoutlmIOManager(MlflowBaseIOManager):
@@ -37,23 +33,14 @@ def handle_output(self, context: OutputContext, finetuned_model: Trainer):
 
     def load_input(self, context: InputContext) -> dict:
         """Log metrics to mlflow run/experiment created by `MlflowInterface`."""
-        return _load_pretrained_layoutlm(self.version)
-
-
-class LayoutlmResource(ConfigurableResource):
-    """Dagster resource for loading/using pretrained layoutlm model as a resource."""
-
-    mlflow_interface: MlflowInterface
-    version: str | None = None
-    _model_components: dict = PrivateAttr()
-
-    def setup_for_execution(self, context):
-        """Load layoutlm from mlflow."""
-        self._model_components = _load_pretrained_layoutlm(self.version)
-
-    def get_model_components(self):
-        """Return model components from loaded model."""
-        return self._model_components["model"], self._model_components["tokenizer"]
+        cache_path = (
+            self.mlflow_interface.dagster_home_path / "model_cache" / "layoutlm"
+        )
+        cache_path.mkdir(exist_ok=True, parents=True)
+        return _load_pretrained_layoutlm(
+            cache_path=cache_path,
+            version=self.version,
+        )
 
 
 def normalize_bboxes(txt_df, pg_meta_df):

From 364276559c61d696b656c1640c731299622df475 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Thu, 19 Sep 2024 16:40:16 -0400
Subject: [PATCH 072/161] Fix broken test

---
 tests/integration/models/sec10k/extract_test.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/integration/models/sec10k/extract_test.py b/tests/integration/models/sec10k/extract_test.py
index 26f7fd6..750c969 100644
--- a/tests/integration/models/sec10k/extract_test.py
+++ b/tests/integration/models/sec10k/extract_test.py
@@ -34,6 +34,7 @@ def test_basic_10k_validation(
 
 
 def test_ex21_validation(
+    tmp_path,
     set_test_mlflow_env_vars_factory,
 ):
     """Test ex21_validation_job."""
@@ -42,7 +43,9 @@ def test_ex21_validation(
         os.getenv("MLFLOW_TRACKING_URI"),
         os.getenv("GCS_PROJECT"),
     )
-    pretrained_model = sec10k.utils.layoutlm._load_pretrained_layoutlm()
+    pretrained_model = sec10k.utils.layoutlm._load_pretrained_layoutlm(
+        cache_path=tmp_path
+    )
 
     with unittest.mock.patch(
         "mozilla_sec_eia.models.sec10k.utils.layoutlm._load_pretrained_layoutlm",

From 830bd74288d3887129878a35db2e9db081fe286b Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Fri, 20 Sep 2024 08:25:12 +0100
Subject: [PATCH 073/161] fix rename filings

---
 .../sec10k/ex_21/rename_labeled_filings.py    | 24 ++++++++++---------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/rename_labeled_filings.py b/src/mozilla_sec_eia/models/sec10k/ex_21/rename_labeled_filings.py
index 182dd04..1de455f 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/rename_labeled_filings.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/rename_labeled_filings.py
@@ -10,7 +10,7 @@
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
 
-def rename_filings():
+def rename_filings(labeled_bucket_name="labeledv0.1"):
     """Rename labeled filings in GCS after importing from Label Studio.
 
     After importing labeled documents from Label Studio into GCS the
@@ -22,18 +22,20 @@ def rename_filings():
     filename.
     """
     archive = GCSArchive()
-    bucket = archive._labels_bucket
-
-    labeled_bucket_name = "labeled/"
+    bucket_path = archive.labels_bucket_path / labeled_bucket_name
 
-    for blob in bucket.list_blobs(prefix=labeled_bucket_name):
-        if blob.name != labeled_bucket_name:
-            logger.info(blob.name)
-            file_dict = json.loads(blob.download_as_text())
+    for file in bucket_path.iterdir():
+        filename = file.parts[-1]
+        if filename != labeled_bucket_name:
+            logger.info(filename)
+            file_dict = json.loads(file.read_text())
             archive_name = file_dict["task"]["data"]["ocr"].split("/")[-1].split(".")[0]
-            archive_filepath = f"{labeled_bucket_name}/{archive_name}"
-            logger.info(archive_filepath)
-            bucket.rename_blob(blob, archive_filepath)
+            # check if name uses the old local filing naming schema
+            if len(archive_name.split("-")) == 6:
+                archive_name = "-".join(archive_name.split("-")[2:])
+            new_name = file.with_name(archive_name)
+            logger.info(new_name)
+            file.move(new_name)
 
 
 def copy_labeled_jsons_to_new_version_folder(

From 2cd1fe629a27b591d50145d0b0f87a58b2bc4a8a Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Fri, 20 Sep 2024 12:36:56 +0100
Subject: [PATCH 074/161] fix paths to cache training data

---
 .../models/sec10k/utils/cloud.py              | 27 +++++++++++++++----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py
index 930f44f..64e5dc3 100644
--- a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py
+++ b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py
@@ -55,6 +55,13 @@ def from_10k(cls, filename: str, sec10k_text: str, ex_21_version: str):
 
     def save_as_pdf(self, file: BinaryIO):
         """Save Exhibit 21 as a PDF in `file`, which can be in memory or on disk."""
+        # TODO: consider keeping a "corrections" file of the CSS/HTML replacements
+        # needed to make the PDF render
+        # TODO: should probably also catch rendering errors instead of failing
+        if "border-bottom: black thin solid;" in self.ex_21_text:
+            self.ex_21_text = self.ex_21_text.replace(
+                "border-bottom: black thin solid;", "border-bottom: 1px solid black;"
+            )
         res = pisa.CreatePDF(self.ex_21_text, file)
         if res.err:
             logger.warning(
@@ -197,9 +204,17 @@ def get_filings(
         filings = []
         for _, filing in filing_selection.iterrows():
             local_path = self.get_local_filename(cache_directory, filing)
+            year_quarter = filing["year_quarter"]
             if not local_path.exists():
                 with local_path.open("w") as f:
-                    f.write((self.filings_bucket_path / filing.filename).read_text())
+                    f.write(
+                        (
+                            self.filings_bucket_path
+                            / "sec10k"
+                            / f"sec10k-{year_quarter}"
+                            / filing.filename
+                        ).read_text()
+                    )
 
             with local_path.open() as f:
                 sec10k_filing = Sec10K.from_file(
@@ -257,11 +272,12 @@ def cache_training_data(
         json_cache_path.mkdir(parents=True, exist_ok=True)
         pdf_cache_path.mkdir(parents=True, exist_ok=True)
         metadata_df = self.get_metadata()
-        label_name_pattern = re.compile(r"(\d+)-\d{4}q[1-4]-\d+-(.+)")
-
+        # Previous pattern (names with a year-quarter part): r"(\d+)-\d{4}q[1-4]-\d+-(.+)"
+        label_name_pattern = re.compile(r"(\d+)-(.+)")
         # Cache filings and labels
         filenames = []
         direc = self.labels_bucket_path / gcs_folder_name
+        logger.info(direc.is_dir())
         for file in direc.iterdir():
             if file.name == gcs_folder_name:
                 continue
@@ -273,10 +289,11 @@ def cache_training_data(
             match = label_name_pattern.search(file.name)
             filenames.append(f"edgar/data/{match.group(1)}/{match.group(2)}.txt")
 
+        metadata_df = metadata_df.reset_index()
         filings = metadata_df[metadata_df["filename"].isin(filenames)]
         self.get_filings(
-            filings,
-            cache_path=pdf_cache_path,
+            filing_selection=filings,
+            cache_directory=pdf_cache_path,
             cache_pdf=True,
         )
 

From 64dc8c580bc8a3554c8aa743116dd9bb7aec5461 Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Fri, 20 Sep 2024 13:39:25 +0100
Subject: [PATCH 075/161] update root dir path

---
 .../models/sec10k/ex_21/create_labeled_dataset.py               | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py b/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py
index 55e1d5a..47d5ee8 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py
@@ -16,7 +16,7 @@
 )
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
-ROOT_DIR = Path(__file__).parent.parent.parent.parent.resolve()
+ROOT_DIR = Path(__file__).parent.parent.parent.parent.parent.parent.resolve()
 
 
 BBOX_COLS_PDF = [

From 226d91ce52696e0c3b33b54023f3c626ee5ee8ef Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Fri, 20 Sep 2024 10:10:17 -0400
Subject: [PATCH 076/161] Fix UPath initialization

---
 src/mozilla_sec_eia/models/sec10k/utils/cloud.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py
index 930f44f..493c69a 100644
--- a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py
+++ b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py
@@ -144,17 +144,23 @@ class GCSArchive(ConfigurableResource):
     @property
     def filings_bucket_path(self):
         """Return UPath of filings bucket."""
-        return UPath(self.filings_bucket)
+        path = UPath(self.filings_bucket)
+        assert path.exists(), "Filings bucket path does not exist"
+        return path
 
     @property
     def labels_bucket_path(self):
         """Return UPath of filings bucket."""
-        return UPath(self.labels_bucket)
+        path = UPath(self.labels_bucket)
+        assert path.exists(), "Labels bucket path does not exist"
+        return path
 
     @property
     def outputs_bucket_path(self):
         """Return UPath of filings bucket."""
-        return UPath(self.outputs_bucket)
+        path = UPath(self.outputs_bucket)
+        assert path.exists(), "Outputs bucket path does not exist"
+        return path
 
     def get_metadata(self, year_quarter: str | None = None) -> pd:
         """Return dataframe of filing metadata."""

From 3c17d33a71ca78547e3541453e660fdc1336002e Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Fri, 20 Sep 2024 10:32:23 -0400
Subject: [PATCH 077/161] Fix path in test

---
 tests/conftest.py                               | 3 ++-
 tests/integration/models/sec10k/extract_test.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index d1a47d4..1339467 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -37,8 +37,9 @@ def test_dir() -> Path:
 
 
 @pytest.fixture
-def set_test_mlflow_env_vars_factory():
+def set_test_mlflow_env_vars_factory(tmp_path):
     def factory():
+        os.environ["DAGSTER_HOME"] = str(tmp_path)
         # Use in memory tracking backend unless USE_TRACKING_SERVER is set
         if not os.getenv("USE_TRACKING_SERVER"):
             os.environ["MLFLOW_TRACKING_URI"] = "sqlite:///:memory:"
diff --git a/tests/integration/models/sec10k/extract_test.py b/tests/integration/models/sec10k/extract_test.py
index 750c969..88e5415 100644
--- a/tests/integration/models/sec10k/extract_test.py
+++ b/tests/integration/models/sec10k/extract_test.py
@@ -49,7 +49,7 @@ def test_ex21_validation(
 
     with unittest.mock.patch(
         "mozilla_sec_eia.models.sec10k.utils.layoutlm._load_pretrained_layoutlm",
-        new=lambda _: pretrained_model,
+        new=lambda cache_path, version: pretrained_model,
     ):
         set_test_mlflow_env_vars_factory()
         result = sec10k.defs.get_job_def(

From df69f42c222c4f1de7dcbf3de66b73ca0b83b01b Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Fri, 20 Sep 2024 11:23:03 -0400
Subject: [PATCH 078/161] Create huggingface dataset outside model execution

---
 src/mozilla_sec_eia/models/sec10k/__init__.py |   5 -
 .../models/sec10k/ex_21/__init__.py           |  49 +-----
 .../models/sec10k/ex_21/inference.py          | 140 ++++++++----------
 3 files changed, 68 insertions(+), 126 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py
index 4148007..fd9a866 100644
--- a/src/mozilla_sec_eia/models/sec10k/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/__init__.py
@@ -45,10 +45,6 @@
     ex_21.validation_assets,
 )
 
-ex21_test_job = model_jobs.create_validation_model_job(
-    "ex21_test", [ex_21.test_extraction_metrics]
-)
-
 layoutlm_finetune_job = model_jobs.create_training_job(
     "layoutlm_finetune",
     layoutlm_assets,
@@ -62,7 +58,6 @@
         basic_10k_validation_job,
         ex21_production_job,
         ex21_validation_job,
-        ex21_test_job,
         layoutlm_finetune_job,
     ],
     resources={
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
index 890f44e..ce971c5 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
@@ -1,11 +1,9 @@
 """Module for working with exhibit 21 data."""
 
 import logging
-import traceback
 
 import mlflow
 import pandas as pd
-import torch
 from dagster import AssetIn, AssetOut, Out, asset, graph_multi_asset, multi_asset, op
 
 from mozilla_sec_eia.library import validation_helpers
@@ -19,7 +17,7 @@
 )
 from ..extract import chunk_filings, sec10k_filing_metadata, year_quarter_partitions
 from ..utils.cloud import GCSArchive, cloud_interface_resource, get_metadata_filename
-from .inference import Exhibit21Extractor, clean_extracted_df
+from .inference import Exhibit21Extractor, clean_extracted_df, extract_filings
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
@@ -147,25 +145,6 @@ def clean_ex21_validation_set(validation_df: pd.DataFrame):
     return validation_df
 
 
-@asset
-def test_extraction_metrics(
-    cloud_interface: GCSArchive,
-    exhibit21_extractor: Exhibit21Extractor,
-    mlflow_interface: MlflowInterface,
-):
-    """Run extraction with various numbers of filings to view resource usage."""
-    filings = cloud_interface.get_metadata()
-    for num_filings in [8, 16, 32, 64, 128]:
-        with mlflow.start_run(
-            run_name=f"extract_{num_filings}_filings",
-            nested=True,
-            parent_run_id=mlflow_interface.mlflow_run_id,
-            experiment_id=MlflowInterface.get_or_create_experiment("ex21_test"),
-        ):
-            mlflow.log_param("num_filings", num_filings)
-            exhibit21_extractor.extract_filings(filings.sample(num_filings))
-
-
 @op(
     out={
         "metadata": Out(dagster_type=sec10k_extract_metadata_type),
@@ -178,24 +157,7 @@ def extract_filing_chunk(
     layoutlm,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Extract a set of filings and return results."""
-    try:
-        metadata, extracted = exhibit21_extractor.extract_filings(
-            filings,
-            model=layoutlm["model"],
-            processor=layoutlm["tokenizer"],
-        )
-    except (torch.OutOfMemoryError, RuntimeError):
-        logger.warning(traceback.format_exc())
-        logger.warning(f"Error while extracting filings: {filings['filename']}")
-        metadata = pd.DataFrame(
-            {
-                "filename": filings["filename"],
-                "success": [False] * len(filings),
-                "notes": ["Out of memory error"] * len(filings),
-            }
-        ).set_index("filename")
-        extracted = Ex21CompanyOwnership.example(size=0)
-    return metadata, extracted
+    return extract_filings(exhibit21_extractor, filings, layoutlm)
 
 
 @op(
@@ -272,12 +234,9 @@ def ex21_extract_validation(
     layoutlm,
 ):
     """Extract ownership info from exhibit 21 docs."""
-    metadata, extracted = exhibit21_extractor.extract_filings(
-        ex21_validation_filing_metadata,
-        model=layoutlm["model"],
-        processor=layoutlm["tokenizer"],
+    return extract_filings(
+        exhibit21_extractor, ex21_validation_filing_metadata, layoutlm
     )
-    return metadata, extracted
 
 
 exhibit_21_extractor_resource = Exhibit21Extractor(
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
index ff4be90..d69ddb5 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
@@ -3,6 +3,7 @@
 import logging
 import os
 import tempfile
+import traceback
 from contextlib import contextmanager
 from pathlib import Path
 
@@ -17,7 +18,7 @@
 )
 from transformers.tokenization_utils_base import BatchEncoding
 
-from ..entities import Ex21CompanyOwnership
+from ..entities import Ex21CompanyOwnership, Sec10kExtractionMetadata
 from ..utils.cloud import GCSArchive, get_metadata_filename
 from ..utils.layoutlm import (
     get_id_label_conversions,
@@ -70,15 +71,33 @@ def format_unlabeled_pdf_dataframe(pdfs_dir: Path):
     return inference_df
 
 
-def create_inference_dataset(pdfs_dir: Path, labeled_json_dir=None, has_labels=False):
+def create_inference_dataset(
+    filing_metadata: pd.DataFrame, cloud_interface: GCSArchive, has_labels: bool = False
+) -> tuple[pd.DataFrame, Dataset]:
     """Create a Hugging Face Dataset from PDFs for inference."""
-    if has_labels:
-        inference_df = format_label_studio_output(
-            labeled_json_dir=labeled_json_dir, pdfs_dir=pdfs_dir
+    filings_with_ex21 = filing_metadata[~filing_metadata["exhibit_21_version"].isna()]
+
+    # Cache PDFs in a temporary directory and parse them
+    with (
+        tempfile.TemporaryDirectory() as pdfs_dir,
+        tempfile.TemporaryDirectory() as labeled_json_dir,
+    ):
+        pdfs_dir = Path(pdfs_dir)
+        labeled_json_dir = Path(labeled_json_dir)
+
+        extraction_metadata = _cache_pdfs(
+            filings_with_ex21,
+            cloud_interface=cloud_interface,
+            pdf_dir=pdfs_dir,
         )
-    else:
-        inference_df = format_unlabeled_pdf_dataframe(pdfs_dir=pdfs_dir)
-    image_dict = get_image_dict(pdfs_dir)
+        if has_labels:
+            inference_df = format_label_studio_output(
+                labeled_json_dir=labeled_json_dir, pdfs_dir=pdfs_dir
+            )
+        else:
+            inference_df = format_unlabeled_pdf_dataframe(pdfs_dir=pdfs_dir)
+        image_dict = get_image_dict(pdfs_dir)
+
     annotations = []
     for filename in image_dict:
         annotation = {
@@ -96,7 +115,7 @@ def create_inference_dataset(pdfs_dir: Path, labeled_json_dir=None, has_labels=F
         annotations.append(annotation)
 
     dataset = Dataset.from_list(annotations)
-    return dataset
+    return extraction_metadata, dataset
 
 
 def clean_extracted_df(extracted_df):
@@ -239,73 +258,9 @@ def setup_for_execution(self, context):
         os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 
     def extract_filings(
-        self, filing_metadata: pd.DataFrame, model, processor
+        self, dataset: Dataset, model, processor
     ) -> tuple[pd.DataFrame, pd.DataFrame]:
-        """Predict entities with a fine-tuned model and extract Ex. 21 tables.
-
-        This function starts by creating a HuggingFace dataset from PDFs in `pdfs_dir`
-        that the model can then perform inference on (`create_inference_dataset`).
-        Then it creates an instance of the custom LayoutLM inference pipeline and
-        runs the dataset through the pipeline. The pipeline outputs logits, predictions,
-        and an output dataframe with extracted Ex. 21 table.
-
-        Arguments:
-            pdfs_dir: Path to the directory with PDFs that are being used for inference.
-            model: A fine-tuned LayoutLM model.
-            processor: The tokenizer and encoder for model inputs.
-            extraction_metadata: A dataframe to track extraction success metrics. Should
-                have columns 'filename' and 'success'.
-            dataset_ind: A list of index numbers of dataset records to be used for inference
-                Default is None, in which the entire dataset created from the PDF directory
-                is used.
-            labeled_json_dir: Path to the directory with labeled JSONs from Label Studio. Cannot
-                be None if has_labels is True.
-            has_labels: Boolean, true if the data has associated labels that can be used in
-                visualizing and validating results.
-            device: String or int, specify what computation device to use for inference
-                i.e. "mps", "cpu", "cuda"
-
-        Returns:
-            logits: A list of logits. The list is the length of the number of documents in the
-                dataset (number of PDFs in pdfs_dir). Each logit object in the list is of
-                shape (batch_size, seq_len, num_labels). Seq_len is
-                the same as token length (512 in this case).
-            predictions: A list of predictions. The list is the length of the number of documents
-                in the dataset (number of PDFs in pdfs_dir).
-                From the logits, we take the highest score for each token, using argmax.
-                This serves as the predicted label for each token. It is shape (seq_len) or token
-                length.
-            output_dfs: The extracted Ex. 21 tables. This is one big dataframe with an ID column
-                that is the filename of the extracted Ex. 21. Dataframe contains columns id,
-                subsidiary, loc, own_per.
-        """
-        filings_with_ex21 = filing_metadata[
-            ~filing_metadata["exhibit_21_version"].isna()
-        ]
-
-        with (
-            tempfile.TemporaryDirectory() as pdf_dir,
-            tempfile.TemporaryDirectory() as labeled_json_dir,
-        ):
-            extraction_metadata = _cache_pdfs(
-                filings_with_ex21,
-                cloud_interface=self.cloud_interface,
-                pdf_dir=pdf_dir,
-            )
-            try:
-                dataset = create_inference_dataset(
-                    pdfs_dir=Path(pdf_dir),
-                    labeled_json_dir=labeled_json_dir,
-                    has_labels=self.has_labels,
-                )
-            # TODO: Investigate failures in creating dataset
-            except KeyError:
-                logger.warning("Failed to create inference dataset!")
-                extraction_metadata.loc[:, "success"] = False
-                extraction_metadata.loc[:, "notes"] = (
-                    "Failed to create inference dataset."
-                )
-                return extraction_metadata, Ex21CompanyOwnership.example(size=0)
+        """Predict entities with a fine-tuned model and extract Ex. 21 tables."""
         if self.dataset_ind:
             dataset = dataset.select(self.dataset_ind)
 
@@ -320,7 +275,8 @@ def extract_filings(
 
         logits = []
         predictions = []
-        all_output_df = pd.DataFrame(columns=["id", "subsidiary", "loc", "own_per"])
+        all_output_df = Ex21CompanyOwnership.example(size=0)
+        extraction_metadata = Sec10kExtractionMetadata.example(size=0)
         for logit, pred, output_df in pipe(_get_data(dataset)):
             logits.append(logit)
             predictions.append(pred)
@@ -335,6 +291,38 @@ def extract_filings(
         return extraction_metadata, all_output_df
 
 
+def extract_filings(
+    exhibit21_extractor: Exhibit21Extractor,
+    filings: pd.DataFrame,
+    layoutlm,
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Create huggingface dataset from filings and perform extraction."""
+    try:
+        failed_metadata, dataset = create_inference_dataset(
+            filing_metadata=filings,
+            cloud_interface=exhibit21_extractor.cloud_interface,
+            has_labels=exhibit21_extractor.has_labels,
+        )
+        metadata, extracted = exhibit21_extractor.extract_filings(
+            dataset,
+            model=layoutlm["model"],
+            processor=layoutlm["tokenizer"],
+        )
+        metadata = pd.concat([failed_metadata, metadata])
+    except Exception as e:
+        logger.warning(traceback.format_exc())
+        logger.warning(f"Error while extracting filings: {filings.index}")
+        metadata = pd.DataFrame(
+            {
+                "filename": filings.index,
+                "success": [False] * len(filings),
+                "notes": [str(e)] * len(filings),
+            }
+        ).set_index("filename")
+        extracted = Ex21CompanyOwnership.example(size=0)
+    return metadata, extracted
+
+
 class LayoutLMInferencePipeline(Pipeline):
     """Pipeline for performing inference with fine-tuned LayoutLM."""
 

From 2d3345cc7f9a3e6f3570c0ec1ea807afd5777d47 Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Fri, 20 Sep 2024 17:23:27 +0100
Subject: [PATCH 079/161] small fixes to path handling

---
 labeled_data_tracking.csv                     | 259 +++++++++++-------
 .../sec10k/ex_21/create_labeled_dataset.py    |   9 +-
 2 files changed, 169 insertions(+), 99 deletions(-)

diff --git a/labeled_data_tracking.csv b/labeled_data_tracking.csv
index 31a7950..0e9f642 100644
--- a/labeled_data_tracking.csv
+++ b/labeled_data_tracking.csv
@@ -1,97 +1,162 @@
-CIK,Filename,Initials,Notes
-107815,edgar/data/107815/0000107815-17-000106.txt,KL
-354707,edgar/data/354707/0000354707-19-000043.txt,KL
-61339,edgar/data/61339/0001161728-17-000004.txt,KL
-1317577,edgar/data/1317577/0001193125-13-356794.txt,KL
-59527,edgar/data/59527/0000059527-20-000007.txt,KL
-40545,edgar/data/40545/0000040545-04-000013.txt,KL
-84557,edgar/data/84557/0001046861-06-000007.txt,KL
-100826,edgar/data/100826/0001193125-09-042636.txt,KL
-81033,edgar/data/81033/0000950117-06-000927.txt,KL
-4904,edgar/data/4904/0000004904-09-000040.txt,KL
-46207,edgar/data/46207/0001104659-13-011461.txt,KL
-205402,edgar/data/205402/0000950114-99-000043.txt,KL
-77227,edgar/data/77227/0001031296-09-000008.txt,KL
-92487,edgar/data/92487/0000004904-21-000010.txt,KL
-922237,edgar/data/922237/0000950005-99-000915.txt,KL
-106170,edgar/data/106170/0000072741-98-000076.txt,KL
-1223037,edgar/data/1223037/0001193125-09-249998.txt,KL
-3146,edgar/data/3146/0001193125-06-055140.txt,KL
-932628,edgar/data/932628/0000932628-16-000045.txt,KL
-804212,edgar/data/804212/0000804212-14-000014.txt,KL
-92416,edgar/data/92416/0000892569-94-000102.txt,KL
-38079,edgar/data/38079/0001558370-16-004332.txt,KL
-933157,edgar/data/933157/0001144204-08-021779.txt,KL
-869495,edgar/data/869495/0001144204-13-002380.txt,KL
-80812,edgar/data/80812/0000927016-98-004349.txt,KL
-1582244,edgar/data/1582244/0001582244-16-000187.txt,KL
-1166847,edgar/data/1166847/0001117768-12-000118.txt,KL
-86521,edgar/data/86521/0000086521-10-000019.txt,KL
-1012493,edgar/data/1012493/0000922358-99-000021.txt,KL
-1170154,edgar/data/1170154/0001193125-11-062378.txt,KL
-1140414,edgar/data/1140414/0001387131-16-004912.txt,KL
-1158053,edgar/data/1158053/0000893220-04-001186.txt,KL
-71675,edgar/data/71675/0001046861-02-000012.txt,KL
-96271,edgar/data/96271/0001193125-07-042781.txt,KL
-710182,edgar/data/710182/0000930661-97-000576.txt,KL
-1029528,edgar/data/1029528/0001193125-04-043994.txt,KL
-1043186,edgar/data/1043186/0001564590-19-011739.txt,KL
-9342,edgar/data/9342/0000009342-95-000008.txt,KL
-18647,edgar/data/18647/0001169232-08-000603.txt,KL
-20947,edgar/data/20947/0001031296-06-000044.txt,KL
-916529,edgar/data/916529/0001144204-03-001333.txt,KL
-60549,edgar/data/60549/0001047469-98-012481.txt,KL
-38725,edgar/data/38725/0000038725-17-000042.txt,KL
-100122,edgar/data/100122/0000941138-03-000007.txt,KL
-355811,edgar/data/355811/0000355811-18-000009.txt,KL
-1039065,edgar/data/1039065/0001558370-15-001687.txt,KL
-1008654,edgar/data/1008654/0001008654-20-000018.txt,KL
-9534,edgar/data/9534/0000897069-05-000574.txt
-1085866,edgar/data/1085866/0001072613-06-000748.txt
-1045425,edgar/data/1045425/0000893220-05-000599.txt
-1090908,edgar/data/1090908/0001437749-16-034757.txt
-1546640,edgar/data/1546640/0001546640-14-000023.txt
-844143,edgar/data/844143/0001104659-07-008735.txt
-722056,edgar/data/722056/0001012870-99-002106.txt
-1599298,edgar/data/1599298/0001599298-21-000011.txt
-1010961,edgar/data/1010961/0001010961-01-500013.txt
-802781,edgar/data/802781/0000950116-97-000760.txt
-742126,edgar/data/742126/0001015402-05-001005.txt
-930835,edgar/data/930835/0001047469-04-007773.txt
-1174922,edgar/data/1174922/0001193125-10-043336.txt
-1433270,edgar/data/1433270/0001047469-14-001424.txt
-1275229,edgar/data/1275229/0001558370-19-002331.txt
-18230,edgar/data/18230/0000950131-98-002084.txt
-940942,edgar/data/940942/0001564590-21-009409.txt
-320575,edgar/data/320575/0001193125-07-117419.txt
-78778,edgar/data/78778/0000078778-97-000019.txt
-1627811,edgar/data/1627811/0001493152-19-004568.txt
-78890,edgar/data/78890/0000078890-14-000004.txt
-99250,edgar/data/99250/0000099250-00-000002.txt
-78100,edgar/data/78100/0001109357-20-000053.txt
-700949,edgar/data/700949/0000892626-96-000081.txt
-1468174,edgar/data/1468174/0001468174-21-000011.txt
-805730,edgar/data/805730/0001104659-05-009806.txt
-820242,edgar/data/820242/0000912057-01-517770.txt
-52795,edgar/data/52795/0000950137-00-000865.txt
-944130,edgar/data/944130/0001432093-11-000164.txt
-66901,edgar/data/66901/0000065984-96-000046.txt
-722077,edgar/data/722077/0001047469-15-002056.txt
-103872,edgar/data/103872/0001193125-13-444053.txt
-1065201,edgar/data/1065201/0001193125-10-070085.txt
-729213,edgar/data/729213/0001038838-01-000141.txt
-1383414,edgar/data/1383414/0001193125-14-409216.txt
-1493594,edgar/data/1493594/0001493594-19-000064.txt
-1039399,edgar/data/1039399/0001039399-20-000011.txt
-943452,edgar/data/943452/0001193125-07-043570.txt
-944739,edgar/data/944739/0001193125-06-035399.txt
-61986,edgar/data/61986/0000061986-99-000003.txt
-6769,edgar/data/6769/0000950129-03-001523.txt
-319201,edgar/data/319201/0000891618-98-004336.txt
-34067,edgar/data/34067/0001104659-06-016592.txt
-1265245,edgar/data/1265245/0000770944-04-000004.txt
-1066134,edgar/data/1066134/0001193125-08-186978.txt
-789570,edgar/data/789570/0000898430-95-000343.txt
-1273013,edgar/data/1273013/0001104659-07-020456.txt
-88205,edgar/data/88205/0000950168-03-000755.txt
-1286613,edgar/data/1286613/0001140361-18-012880.txt
+,CIK,Filename,Initials,Notes
+0,107815,edgar/data/107815/0000107815-17-000106.txt,KL,
+1,354707,edgar/data/354707/0000354707-19-000043.txt,KL,
+2,61339,edgar/data/61339/0001161728-17-000004.txt,KL,
+3,1317577,edgar/data/1317577/0001193125-13-356794.txt,KL,
+4,59527,edgar/data/59527/0000059527-20-000007.txt,KL,
+5,40545,edgar/data/40545/0000040545-04-000013.txt,KL,
+6,84557,edgar/data/84557/0001046861-06-000007.txt,KL,
+7,100826,edgar/data/100826/0001193125-09-042636.txt,KL,
+8,81033,edgar/data/81033/0000950117-06-000927.txt,KL,
+9,4904,edgar/data/4904/0000004904-09-000040.txt,KL,
+10,46207,edgar/data/46207/0001104659-13-011461.txt,KL,
+11,205402,edgar/data/205402/0000950114-99-000043.txt,KL,
+12,77227,edgar/data/77227/0001031296-09-000008.txt,KL,
+13,92487,edgar/data/92487/0000004904-21-000010.txt,KL,
+14,922237,edgar/data/922237/0000950005-99-000915.txt,KL,
+15,106170,edgar/data/106170/0000072741-98-000076.txt,KL,
+16,1223037,edgar/data/1223037/0001193125-09-249998.txt,KL,
+17,3146,edgar/data/3146/0001193125-06-055140.txt,KL,
+18,932628,edgar/data/932628/0000932628-16-000045.txt,KL,
+19,804212,edgar/data/804212/0000804212-14-000014.txt,KL,
+20,92416,edgar/data/92416/0000892569-94-000102.txt,KL,
+21,38079,edgar/data/38079/0001558370-16-004332.txt,KL,
+22,933157,edgar/data/933157/0001144204-08-021779.txt,KL,
+23,869495,edgar/data/869495/0001144204-13-002380.txt,KL,
+24,80812,edgar/data/80812/0000927016-98-004349.txt,KL,
+25,1582244,edgar/data/1582244/0001582244-16-000187.txt,KL,
+26,1166847,edgar/data/1166847/0001117768-12-000118.txt,KL,
+27,86521,edgar/data/86521/0000086521-10-000019.txt,KL,
+28,1012493,edgar/data/1012493/0000922358-99-000021.txt,KL,
+29,1170154,edgar/data/1170154/0001193125-11-062378.txt,KL,
+30,1140414,edgar/data/1140414/0001387131-16-004912.txt,KL,
+31,1158053,edgar/data/1158053/0000893220-04-001186.txt,KL,
+32,71675,edgar/data/71675/0001046861-02-000012.txt,KL,
+33,96271,edgar/data/96271/0001193125-07-042781.txt,KL,
+34,710182,edgar/data/710182/0000930661-97-000576.txt,KL,
+35,1029528,edgar/data/1029528/0001193125-04-043994.txt,KL,
+36,1043186,edgar/data/1043186/0001564590-19-011739.txt,KL,
+37,9342,edgar/data/9342/0000009342-95-000008.txt,KL,
+38,18647,edgar/data/18647/0001169232-08-000603.txt,KL,
+39,20947,edgar/data/20947/0001031296-06-000044.txt,KL,
+40,916529,edgar/data/916529/0001144204-03-001333.txt,KL,
+41,60549,edgar/data/60549/0001047469-98-012481.txt,KL,
+42,38725,edgar/data/38725/0000038725-17-000042.txt,KL,
+43,100122,edgar/data/100122/0000941138-03-000007.txt,KL,
+44,355811,edgar/data/355811/0000355811-18-000009.txt,KL,
+45,1039065,edgar/data/1039065/0001558370-15-001687.txt,KL,
+46,1008654,edgar/data/1008654/0001008654-20-000018.txt,KL,
+47,9534,edgar/data/9534/0000897069-05-000574.txt,KL,
+48,1085866,edgar/data/1085866/0001072613-06-000748.txt,KL,
+49,1045425,edgar/data/1045425/0000893220-05-000599.txt,KL,
+50,1090908,edgar/data/1090908/0001437749-16-034757.txt,KL,
+51,1546640,edgar/data/1546640/0001546640-14-000023.txt,KL,
+52,844143,edgar/data/844143/0001104659-07-008735.txt,KL,
+53,1599298,edgar/data/1599298/0001599298-21-000011.txt,KL,
+54,1010961,edgar/data/1010961/0001010961-01-500013.txt,KL,
+55,802781,edgar/data/802781/0000950116-97-000760.txt,KL,
+56,742126,edgar/data/742126/0001015402-05-001005.txt,KL,
+57,1174922,edgar/data/1174922/0001193125-10-043336.txt,KL,
+58,1433270,edgar/data/1433270/0001047469-14-001424.txt,KL,
+59,1275229,edgar/data/1275229/0001558370-19-002331.txt,KL,
+60,940942,edgar/data/940942/0001564590-21-009409.txt,KL,
+61,320575,edgar/data/320575/0001193125-07-117419.txt,KL,
+62,78778,edgar/data/78778/0000078778-97-000019.txt,KL,
+63,1627811,edgar/data/1627811/0001493152-19-004568.txt,KL,
+64,99250,edgar/data/99250/0000099250-00-000002.txt,KL,
+65,700949,edgar/data/700949/0000892626-96-000081.txt,KL,
+66,805730,edgar/data/805730/0001104659-05-009806.txt,KL,
+67,820242,edgar/data/820242/0000912057-01-517770.txt,KL,
+68,944130,edgar/data/944130/0001432093-11-000164.txt,KL,
+69,66901,edgar/data/66901/0000065984-96-000046.txt,KL,
+70,722077,edgar/data/722077/0001047469-15-002056.txt,KL,
+71,103872,edgar/data/103872/0001193125-13-444053.txt,KL,
+72,1065201,edgar/data/1065201/0001193125-10-070085.txt,KL,
+73,729213,edgar/data/729213/0001038838-01-000141.txt,KL,
+74,1383414,edgar/data/1383414/0001193125-14-409216.txt,KL,
+75,1493594,edgar/data/1493594/0001493594-19-000064.txt,KL,
+76,1039399,edgar/data/1039399/0001039399-20-000011.txt,KL,
+77,943452,edgar/data/943452/0001193125-07-043570.txt,KL,
+78,944739,edgar/data/944739/0001193125-06-035399.txt,KL,
+79,61986,edgar/data/61986/0000061986-99-000003.txt,KL,
+80,319201,edgar/data/319201/0000891618-98-004336.txt,KL,
+81,34067,edgar/data/34067/0001104659-06-016592.txt,KL,
+82,1265245,edgar/data/1265245/0000770944-04-000004.txt,KL,
+83,1066134,edgar/data/1066134/0001193125-08-186978.txt,KL,
+84,789570,edgar/data/789570/0000898430-95-000343.txt,KL,
+85,1273013,edgar/data/1273013/0001104659-07-020456.txt,KL,
+86,88205,edgar/data/88205/0000950168-03-000755.txt,KL,
+87,1286613,edgar/data/1286613/0001140361-18-012880.txt,KL,
+88,92416,edgar/data/92416/0001193125-17-062419.txt,KL,
+89,68589,edgar/data/68589/0000068589-11-000002.txt,KL,
+90,1738827,edgar/data/1738827/0001558370-19-002349.txt,KL,
+91,1283140,edgar/data/1283140/0000950134-08-002891.txt,KL,
+92,72903,edgar/data/72903/0001104659-07-013272.txt,KL,
+93,104819,edgar/data/104819/0001193125-14-422013.txt,KL,
+94,872248,edgar/data/872248/0000950123-10-018189.txt,KL,
+95,1035002,edgar/data/1035002/0001035002-19-000008.txt,KL,
+96,1300514,edgar/data/1300514/0000950123-12-004305.txt,KL,
+98,1868941,edgar/data/1868941/0001868941-22-000120.txt,KL,
+99,1004155,edgar/data/1004155/0000092122-23-000012.txt,KL,
+100,1013871,edgar/data/1013871/0000950123-08-002271.txt,KL,
+101,202584,edgar/data/202584/0000065984-10-000035.txt,KL,
+102,1573166,edgar/data/1573166/0001047469-16-010925.txt,KL,
+103,1106935,edgar/data/1106935/0000945234-03-000135.txt,KL,
+104,70145,edgar/data/70145/0001193125-11-321222.txt,KL,
+105,32689,edgar/data/32689/0001047469-09-001643.txt,KL,
+106,20290,edgar/data/20290/0001326160-19-000057.txt,KL,
+107,1581552,edgar/data/1581552/0001185185-22-000284.txt,KL,
+108,866829,edgar/data/866829/0000866829-12-000009.txt,KL,
+109,1361937,edgar/data/1361937/0001144204-14-012672.txt,KL,
+110,721693,edgar/data/721693/0001213900-19-005898.txt,KL,
+111,1445146,edgar/data/1445146/0001445146-16-000019.txt,KL,
+112,700997,edgar/data/700997/0000950134-96-000227.txt,KL,
+113,1040736,edgar/data/1040736/0001040736-05-000005.txt,KL,
+114,315189,edgar/data/315189/0000940180-02-001741.txt,KL,
+115,6314,edgar/data/6314/0000891092-07-004187.txt,KL,
+116,1069157,edgar/data/1069157/0001047469-04-007313.txt,KL,
+117,1436161,edgar/data/1436161/0001553350-15-000363.txt,KL,
+118,1047098,edgar/data/1047098/0001193125-04-161613.txt,KL,
+119,842635,edgar/data/842635/0000899243-98-000530.txt,KL,
+120,29644,edgar/data/29644/0000897101-11-000318.txt,KL,
+121,804269,edgar/data/804269/0001193125-06-188013.txt,KL,
+122,46738,edgar/data/46738/0000950131-01-001406.txt,KL,
+123,99780,edgar/data/99780/0000099780-20-000025.txt,KL,
+124,822662,edgar/data/822662/0000822662-18-000021.txt,KL,
+125,108516,edgar/data/108516/0001564590-19-027053.txt,KL,
+127,1430306,edgar/data/1430306/0001387131-20-003189.txt,KL,
+128,909413,edgar/data/909413/0000950129-99-001323.txt,KL,
+129,725058,edgar/data/725058/0000950109-97-002390.txt,KL,
+130,1668370,edgar/data/1668370/0001575872-22-000280.txt,KL,
+131,1035688,edgar/data/1035688/0000950123-10-025544.txt,KL,
+132,865911,edgar/data/865911/0000891554-01-501498.txt,KL,
+133,790708,edgar/data/790708/0000950168-03-001327.txt,KL,
+134,881665,edgar/data/881665/0001193125-07-062920.txt,KL,
+135,1440799,edgar/data/1440799/0001144204-11-045638.txt,KL,
+136,32604,edgar/data/32604/0000032604-97-000015.txt,KL,
+137,1421517,edgar/data/1421517/0000950123-10-024776.txt,KL,
+138,1032208,edgar/data/1032208/0000086521-16-000091.txt,KL,
+139,936340,edgar/data/936340/0000950123-11-015771.txt,KL,
+140,728385,edgar/data/728385/0001477932-16-009335.txt,KL,
+141,731802,edgar/data/731802/0000731802-19-000037.txt,KL,
+142,30371,edgar/data/30371/0001326160-17-000016.txt,KL,
+143,1623360,edgar/data/1623360/0001640334-18-002417.txt,KL,
+144,57183,edgar/data/57183/0001068800-04-000659.txt,KL,
+145,43350,edgar/data/43350/0001144204-17-014878.txt,KL,
+146,319019,edgar/data/319019/0000319019-96-000013.txt,KL,
+147,726435,edgar/data/726435/0001406774-11-000031.txt,KL,
+148,811156,edgar/data/811156/0001047469-13-001373.txt,KL,
+149,1368802,edgar/data/1368802/0001144204-07-014837.txt,KL,
+150,1273441,edgar/data/1273441/0001273441-13-000012.txt,KL,
+152,1135338,edgar/data/1135338/0000950116-05-001941.txt,KL,
+153,1375063,edgar/data/1375063/0001144204-14-055659.txt,KL,
+154,103682,edgar/data/103682/0001193125-05-038710.txt,KL,
+155,1081316,edgar/data/1081316/0001081316-14-000006.txt,KL,
+156,54507,edgar/data/54507/0000054507-18-000012.txt,KL,
+157,1130310,edgar/data/1130310/0001130310-16-000031.txt,KL,
+158,884504,edgar/data/884504/0001144204-10-017335.txt,KL,
+159,1555177,edgar/data/1555177/0001555177-17-000011.txt,KL,
+160,1142129,edgar/data/1142129/0001493152-17-005793.txt,KL,
+161,1059025,edgar/data/1059025/0000934665-99-000002.txt,KL,
+162,318996,edgar/data/318996/0000318996-18-000007.txt,KL,
+163,350563,edgar/data/350563/0001193125-12-078254.txt,KL,
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py b/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py
index 47d5ee8..39ac161 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py
@@ -140,8 +140,6 @@ def _is_cik_in_training_data(labeled_json_filename, tracking_df):
     return cik in tracking_df.CIK.unique()
 
 
-# TODO: make this work with GCS input directory not local
-# TODO: have default paths?
 def format_label_studio_output(
     labeled_json_dir=ROOT_DIR / "sec10k_filings/labeled_jsons",
     pdfs_dir=ROOT_DIR / "sec10k_filings/pdfs",
@@ -157,6 +155,9 @@ def format_label_studio_output(
         with Path.open(json_file_path) as j:
             doc_dict = json.loads(j.read())
             filename = doc_dict["task"]["data"]["ocr"].split("/")[-1].split(".")[0]
+            # check if old local naming schema is being used
+            if len(filename.split("-")) == 6:
+                filename = "-".join(filename.split("-")[2:])
             if not _is_cik_in_training_data(filename, tracking_df=tracking_df):
                 continue
             pdf_filename = filename + ".pdf"
@@ -180,6 +181,10 @@ def format_label_studio_output(
             # combine the bounding boxes for each word
             doc_df = doc_df.groupby(level=0).first()
             txt.loc[:, "id"] = filename
+            # TODO: probably want to filter out these empty Ex. 21 docs
+            # the doc might not have any labels in it if it was an empty Ex. 21
+            if "labels" not in doc_df:
+                doc_df.loc[:, "labels"] = pd.Series()
             output_df = pd.concat([txt, doc_df[["labels"]]], axis=1)
             labeled_df = pd.concat([labeled_df, output_df])
 

From 6f9d34a0ee90dbe08cea04b3cd319f7dccbe6f95 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Mon, 23 Sep 2024 10:22:14 -0400
Subject: [PATCH 080/161] Minor fixes

---
 labeled_data_tracking.csv                     | 97 -------------------
 .../library/mlflow/__init__.py                |  1 +
 .../library/mlflow/mlflow_resource.py         |  7 +-
 .../library/validation_helpers.py             | 12 +++
 .../models/sec10k/utils/cloud.py              | 24 +++--
 tests/unit/models/sec10k/utils_test.py        |  4 +-
 6 files changed, 32 insertions(+), 113 deletions(-)
 delete mode 100644 labeled_data_tracking.csv

diff --git a/labeled_data_tracking.csv b/labeled_data_tracking.csv
deleted file mode 100644
index 31a7950..0000000
--- a/labeled_data_tracking.csv
+++ /dev/null
@@ -1,97 +0,0 @@
-CIK,Filename,Initials,Notes
-107815,edgar/data/107815/0000107815-17-000106.txt,KL
-354707,edgar/data/354707/0000354707-19-000043.txt,KL
-61339,edgar/data/61339/0001161728-17-000004.txt,KL
-1317577,edgar/data/1317577/0001193125-13-356794.txt,KL
-59527,edgar/data/59527/0000059527-20-000007.txt,KL
-40545,edgar/data/40545/0000040545-04-000013.txt,KL
-84557,edgar/data/84557/0001046861-06-000007.txt,KL
-100826,edgar/data/100826/0001193125-09-042636.txt,KL
-81033,edgar/data/81033/0000950117-06-000927.txt,KL
-4904,edgar/data/4904/0000004904-09-000040.txt,KL
-46207,edgar/data/46207/0001104659-13-011461.txt,KL
-205402,edgar/data/205402/0000950114-99-000043.txt,KL
-77227,edgar/data/77227/0001031296-09-000008.txt,KL
-92487,edgar/data/92487/0000004904-21-000010.txt,KL
-922237,edgar/data/922237/0000950005-99-000915.txt,KL
-106170,edgar/data/106170/0000072741-98-000076.txt,KL
-1223037,edgar/data/1223037/0001193125-09-249998.txt,KL
-3146,edgar/data/3146/0001193125-06-055140.txt,KL
-932628,edgar/data/932628/0000932628-16-000045.txt,KL
-804212,edgar/data/804212/0000804212-14-000014.txt,KL
-92416,edgar/data/92416/0000892569-94-000102.txt,KL
-38079,edgar/data/38079/0001558370-16-004332.txt,KL
-933157,edgar/data/933157/0001144204-08-021779.txt,KL
-869495,edgar/data/869495/0001144204-13-002380.txt,KL
-80812,edgar/data/80812/0000927016-98-004349.txt,KL
-1582244,edgar/data/1582244/0001582244-16-000187.txt,KL
-1166847,edgar/data/1166847/0001117768-12-000118.txt,KL
-86521,edgar/data/86521/0000086521-10-000019.txt,KL
-1012493,edgar/data/1012493/0000922358-99-000021.txt,KL
-1170154,edgar/data/1170154/0001193125-11-062378.txt,KL
-1140414,edgar/data/1140414/0001387131-16-004912.txt,KL
-1158053,edgar/data/1158053/0000893220-04-001186.txt,KL
-71675,edgar/data/71675/0001046861-02-000012.txt,KL
-96271,edgar/data/96271/0001193125-07-042781.txt,KL
-710182,edgar/data/710182/0000930661-97-000576.txt,KL
-1029528,edgar/data/1029528/0001193125-04-043994.txt,KL
-1043186,edgar/data/1043186/0001564590-19-011739.txt,KL
-9342,edgar/data/9342/0000009342-95-000008.txt,KL
-18647,edgar/data/18647/0001169232-08-000603.txt,KL
-20947,edgar/data/20947/0001031296-06-000044.txt,KL
-916529,edgar/data/916529/0001144204-03-001333.txt,KL
-60549,edgar/data/60549/0001047469-98-012481.txt,KL
-38725,edgar/data/38725/0000038725-17-000042.txt,KL
-100122,edgar/data/100122/0000941138-03-000007.txt,KL
-355811,edgar/data/355811/0000355811-18-000009.txt,KL
-1039065,edgar/data/1039065/0001558370-15-001687.txt,KL
-1008654,edgar/data/1008654/0001008654-20-000018.txt,KL
-9534,edgar/data/9534/0000897069-05-000574.txt
-1085866,edgar/data/1085866/0001072613-06-000748.txt
-1045425,edgar/data/1045425/0000893220-05-000599.txt
-1090908,edgar/data/1090908/0001437749-16-034757.txt
-1546640,edgar/data/1546640/0001546640-14-000023.txt
-844143,edgar/data/844143/0001104659-07-008735.txt
-722056,edgar/data/722056/0001012870-99-002106.txt
-1599298,edgar/data/1599298/0001599298-21-000011.txt
-1010961,edgar/data/1010961/0001010961-01-500013.txt
-802781,edgar/data/802781/0000950116-97-000760.txt
-742126,edgar/data/742126/0001015402-05-001005.txt
-930835,edgar/data/930835/0001047469-04-007773.txt
-1174922,edgar/data/1174922/0001193125-10-043336.txt
-1433270,edgar/data/1433270/0001047469-14-001424.txt
-1275229,edgar/data/1275229/0001558370-19-002331.txt
-18230,edgar/data/18230/0000950131-98-002084.txt
-940942,edgar/data/940942/0001564590-21-009409.txt
-320575,edgar/data/320575/0001193125-07-117419.txt
-78778,edgar/data/78778/0000078778-97-000019.txt
-1627811,edgar/data/1627811/0001493152-19-004568.txt
-78890,edgar/data/78890/0000078890-14-000004.txt
-99250,edgar/data/99250/0000099250-00-000002.txt
-78100,edgar/data/78100/0001109357-20-000053.txt
-700949,edgar/data/700949/0000892626-96-000081.txt
-1468174,edgar/data/1468174/0001468174-21-000011.txt
-805730,edgar/data/805730/0001104659-05-009806.txt
-820242,edgar/data/820242/0000912057-01-517770.txt
-52795,edgar/data/52795/0000950137-00-000865.txt
-944130,edgar/data/944130/0001432093-11-000164.txt
-66901,edgar/data/66901/0000065984-96-000046.txt
-722077,edgar/data/722077/0001047469-15-002056.txt
-103872,edgar/data/103872/0001193125-13-444053.txt
-1065201,edgar/data/1065201/0001193125-10-070085.txt
-729213,edgar/data/729213/0001038838-01-000141.txt
-1383414,edgar/data/1383414/0001193125-14-409216.txt
-1493594,edgar/data/1493594/0001493594-19-000064.txt
-1039399,edgar/data/1039399/0001039399-20-000011.txt
-943452,edgar/data/943452/0001193125-07-043570.txt
-944739,edgar/data/944739/0001193125-06-035399.txt
-61986,edgar/data/61986/0000061986-99-000003.txt
-6769,edgar/data/6769/0000950129-03-001523.txt
-319201,edgar/data/319201/0000891618-98-004336.txt
-34067,edgar/data/34067/0001104659-06-016592.txt
-1265245,edgar/data/1265245/0000770944-04-000004.txt
-1066134,edgar/data/1066134/0001193125-08-186978.txt
-789570,edgar/data/789570/0000898430-95-000343.txt
-1273013,edgar/data/1273013/0001104659-07-020456.txt
-88205,edgar/data/88205/0000950168-03-000755.txt
-1286613,edgar/data/1286613/0001140361-18-012880.txt
diff --git a/src/mozilla_sec_eia/library/mlflow/__init__.py b/src/mozilla_sec_eia/library/mlflow/__init__.py
index a7a65d6..6c32768 100644
--- a/src/mozilla_sec_eia/library/mlflow/__init__.py
+++ b/src/mozilla_sec_eia/library/mlflow/__init__.py
@@ -7,6 +7,7 @@
 )
 from .mlflow_resource import (
     MlflowInterface,
+    configure_mlflow,
     get_most_recent_run,
 )
 
diff --git a/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py b/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py
index d0fa62b..2e015af 100644
--- a/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py
+++ b/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py
@@ -22,8 +22,11 @@
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
 
-def _configure_mlflow(tracking_uri: str, project: str):
+def configure_mlflow(tracking_uri: str | None = None, project: str | None = None):
     """Do runtime configuration of mlflow."""
+    tracking_uri = tracking_uri if tracking_uri else os.getenv("MLFLOW_TRACKING_URI")
+    project = project if project else os.getenv("GCS_PROJECT")
+
     os.environ["MLFLOW_TRACKING_USERNAME"] = "admin"
     os.environ["MLFLOW_TRACKING_PASSWORD"] = _get_tracking_password(
         tracking_uri, project
@@ -90,7 +93,7 @@ def yield_for_execution(
         """Create experiment tracker for specified experiment."""
         dagster_run_id = context.run_id
         self._mlflow_run_id = None
-        _configure_mlflow(self.tracking_uri, self.project)
+        configure_mlflow(self.tracking_uri, self.project)
 
         if self.tracking_enabled:
             # Get run_id associated with current dagster run
diff --git a/src/mozilla_sec_eia/library/validation_helpers.py b/src/mozilla_sec_eia/library/validation_helpers.py
index 62c1825..dfe5afb 100644
--- a/src/mozilla_sec_eia/library/validation_helpers.py
+++ b/src/mozilla_sec_eia/library/validation_helpers.py
@@ -5,6 +5,18 @@
 import pandas as pd
 
 
+def load_training_data(
+    filename: str, index_cols: list[str] | None = None
+) -> pd.DataFrame:
+    """Load csv with training data from `package_data` directory."""
+    df = pd.read_csv(
+        resources.files("mozilla_sec_eia.package_data.training_data") / filename
+    )
+    if index_cols is not None:
+        df = df.set_index(index_cols)
+    return df
+
+
 def load_validation_data(
     filename: str, index_cols: list[str] | None = None
 ) -> pd.DataFrame:
diff --git a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py
index 493c69a..b531308 100644
--- a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py
+++ b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py
@@ -162,7 +162,7 @@ def outputs_bucket_path(self):
         assert path.exists(), "Outputs bucket path does not exist"
         return path
 
-    def get_metadata(self, year_quarter: str | None = None) -> pd:
+    def get_metadata(self, year_quarter: str | None = None) -> pd.DataFrame:
         """Return dataframe of filing metadata."""
         selection = None
         if year_quarter is not None:
@@ -176,10 +176,7 @@ def get_local_filename(
         self, cache_directory: Path, filing: pd.Series | Sec10K, extension=".html"
     ) -> Path:
         """Return path to a filing in local cache based on metadata."""
-        if isinstance(filing, pd.Series):
-            filename = filing["filename"]
-        else:
-            filename = filing.filename
+        filename = filing.name if isinstance(filing, pd.Series) else filing.filename
         return cache_directory / Path(
             f"{filename.replace('edgar/data/', '').replace('/', '-')}".replace(
                 ".txt", extension
@@ -201,16 +198,17 @@ def get_filings(
             cache_pdf: Boolean indicating whether to also cache a PDF of the Ex. 21
         """
         filings = []
-        for _, filing in filing_selection.iterrows():
+        for filename, filing in filing_selection.iterrows():
             local_path = self.get_local_filename(cache_directory, filing)
+            filepath = f"sec10k/sec10k-{filing.year_quarter}/{filename}"
             if not local_path.exists():
                 with local_path.open("w") as f:
-                    f.write((self.filings_bucket_path / filing.filename).read_text())
+                    f.write((self.filings_bucket_path / filepath).read_text())
 
             with local_path.open() as f:
                 sec10k_filing = Sec10K.from_file(
                     file=f,
-                    filename=filing["filename"],
+                    filename=filename,
                     cik=filing["cik"],
                     year_quarter=filing["year_quarter"],
                     ex_21_version=filing["exhibit_21_version"],
@@ -263,13 +261,13 @@ def cache_training_data(
         json_cache_path.mkdir(parents=True, exist_ok=True)
         pdf_cache_path.mkdir(parents=True, exist_ok=True)
         metadata_df = self.get_metadata()
-        label_name_pattern = re.compile(r"(\d+)-\d{4}q[1-4]-\d+-(.+)")
+        label_name_pattern = re.compile(r"(\d+)-(.+)")
 
         # Cache filings and labels
         filenames = []
         direc = self.labels_bucket_path / gcs_folder_name
         for file in direc.iterdir():
-            if file.name == gcs_folder_name:
+            if file.name in gcs_folder_name:
                 continue
             # Cache labels
             with (json_cache_path / file.name).open("w") as f:
@@ -279,10 +277,10 @@ def cache_training_data(
             match = label_name_pattern.search(file.name)
             filenames.append(f"edgar/data/{match.group(1)}/{match.group(2)}.txt")
 
-        filings = metadata_df[metadata_df["filename"].isin(filenames)]
+        filings = metadata_df[metadata_df.index.isin(filenames)]
         self.get_filings(
             filings,
-            cache_path=pdf_cache_path,
+            cache_directory=pdf_cache_path,
             cache_pdf=True,
         )
 
@@ -296,7 +294,7 @@ def validate_archive(self) -> bool:
 
         # Get metadata df
         logger.info("Get list of files in metadata.")
-        metadata_filenames = set(self.get_metadata()["filename"])
+        metadata_filenames = set(self.get_metadata().index)
 
         if not (valid := archive_filenames == metadata_filenames):
             logger.warning("Archive validation failed.")
diff --git a/tests/unit/models/sec10k/utils_test.py b/tests/unit/models/sec10k/utils_test.py
index 1bb7905..845ef2f 100644
--- a/tests/unit/models/sec10k/utils_test.py
+++ b/tests/unit/models/sec10k/utils_test.py
@@ -89,7 +89,9 @@ def test_validate_archive(test_archive, archive_files, metadata_files, valid, mo
         new=archive_files,
     ):
         metadata_mock = mocker.MagicMock(
-            return_value=pd.DataFrame({"filename": metadata_files})
+            return_value=pd.DataFrame({"filename": metadata_files}).set_index(
+                "filename"
+            )
         )
         mocker.patch(
             "mozilla_sec_eia.models.sec10k.utils.cloud.GCSArchive.get_metadata",

From 81813a7b3331f981577d4586f7576748598aabc6 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Tue, 24 Sep 2024 12:25:56 -0400
Subject: [PATCH 081/161] Create dataset as dataframe for logging

---
 src/mozilla_sec_eia/models/sec10k/ex_21/inference.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
index d69ddb5..658ab72 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
@@ -73,7 +73,7 @@ def format_unlabeled_pdf_dataframe(pdfs_dir: Path):
 
 def create_inference_dataset(
     filing_metadata: pd.DataFrame, cloud_interface: GCSArchive, has_labels: bool = False
-) -> tuple[pd.DataFrame, Dataset]:
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Create a Hugging Face Dataset from PDFs for inference."""
     filings_with_ex21 = filing_metadata[~filing_metadata["exhibit_21_version"].isna()]
 
@@ -114,8 +114,7 @@ def create_inference_dataset(
             )
         annotations.append(annotation)
 
-    dataset = Dataset.from_list(annotations)
-    return extraction_metadata, dataset
+    return extraction_metadata, pd.DataFrame(annotations)
 
 
 def clean_extracted_df(extracted_df):

From 5174ed776b726890eb1aacf0efb7f9545a6b6b3e Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Tue, 24 Sep 2024 14:51:39 -0400
Subject: [PATCH 082/161] Modify dataset return type

---
 src/mozilla_sec_eia/models/sec10k/ex_21/inference.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
index 658ab72..7c743bd 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
@@ -99,14 +99,16 @@ def create_inference_dataset(
         image_dict = get_image_dict(pdfs_dir)
 
     annotations = []
-    for filename in image_dict:
+    for filename, image in image_dict.items():
         annotation = {
             "id": filename,
             "tokens": inference_df.groupby("id")["text"].apply(list).loc[filename],
             "bboxes": inference_df.loc[inference_df["id"] == filename, :][BBOX_COLS_PDF]
             .to_numpy()
             .tolist(),
-            "image": image_dict[filename],
+            "image": image.tobytes(),
+            "mode": image.mode,
+            "size": image.size,
         }
         if has_labels:
             annotation["ner_tags"] = (

From 7a572c07967840da51cda298c9c2216d4c1202dd Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Tue, 24 Sep 2024 15:03:01 -0400
Subject: [PATCH 083/161] Fix dataset types for model signature

---
 src/mozilla_sec_eia/models/sec10k/ex_21/inference.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
index 7c743bd..5de6eb9 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
@@ -108,7 +108,8 @@ def create_inference_dataset(
             .tolist(),
             "image": image.tobytes(),
             "mode": image.mode,
-            "size": image.size,
+            "width": image.size[0],
+            "height": image.size[1],
         }
         if has_labels:
             annotation["ner_tags"] = (

From 5728026605781f098c1c40cd1b076d84c0f1b5ba Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 25 Sep 2024 14:24:51 -0400
Subject: [PATCH 084/161] Migrate ex 21 model training to a notebook

---
 .../library/mlflow/__init__.py                |    1 +
 .../library/mlflow/mlflow_io_managers.py      |   26 +
 src/mozilla_sec_eia/models/sec10k/__init__.py |   46 +-
 .../models/sec10k/ex_21/__init__.py           |  184 +--
 .../models/sec10k/ex_21/data.py               |    1 +
 .../models/sec10k/ex_21/inference.py          |  385 ++----
 .../models/sec10k/ex_21/train_extractor.py    |  192 ---
 .../notebooks/exhibit21_extractor.ipynb       | 1084 +++++++++++++++++
 .../train_exhibit21_extraction.ipynb          | 1045 ----------------
 .../models/sec10k/utils/cloud.py              |    2 +-
 tests/unit/models/sec10k/ex21_model_test.py   |    6 +-
 11 files changed, 1239 insertions(+), 1733 deletions(-)
 create mode 100644 src/mozilla_sec_eia/models/sec10k/ex_21/data.py
 delete mode 100644 src/mozilla_sec_eia/models/sec10k/ex_21/train_extractor.py
 create mode 100644 src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
 delete mode 100644 src/mozilla_sec_eia/models/sec10k/notebooks/train_exhibit21_extraction.ipynb

diff --git a/src/mozilla_sec_eia/library/mlflow/__init__.py b/src/mozilla_sec_eia/library/mlflow/__init__.py
index 6c32768..17a765d 100644
--- a/src/mozilla_sec_eia/library/mlflow/__init__.py
+++ b/src/mozilla_sec_eia/library/mlflow/__init__.py
@@ -4,6 +4,7 @@
     MlflowBaseIOManager,
     MlflowMetricsIOManager,
     MlflowPandasArtifactIOManager,
+    MlflowPyfuncModelIOManager,
 )
 from .mlflow_resource import (
     MlflowInterface,
diff --git a/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py b/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py
index 7aa05d7..94468f5 100644
--- a/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py
+++ b/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py
@@ -26,6 +26,32 @@ def _get_run_info(self) -> Run:
         return mlflow.get_run(self.mlflow_interface.mlflow_run_id)
 
 
+class MlflowPyfuncModelIOManager(MlflowBaseIOManager):
+    """IO Manager to load pyfunc models from tracking server."""
+
+    # Explicit model URI; when None, `models:/<input name>` is used instead.
+    uri: str | None = None
+
+    def handle_output(self, context, obj):
+        """Outputs not implemented."""
+        raise NotImplementedError("Logging models not supported by io manager.")
+
+    def load_input(self, context: InputContext):
+        """Load the pyfunc model from mlflow and return it to the consuming op."""
+        cache_path = (
+            self.mlflow_interface.dagster_home_path / "model_cache" / context.name
+        )
+        cache_path.mkdir(exist_ok=True, parents=True)
+
+        model_uri = f"models:/{context.name}" if self.uri is None else self.uri
+        # The value returned here is what the op receives as its input; without
+        # `return` the downstream `.predict` call would run on None.
+        return mlflow.pyfunc.load_model(
+            model_uri,
+            dst_path=cache_path,
+        )
+
+
 class MlflowPandasArtifactIOManager(MlflowBaseIOManager):
     """Implement IO manager for logging/loading dataframes as mlflow artifacts."""
 
diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py
index fd9a866..b482aec 100644
--- a/src/mozilla_sec_eia/models/sec10k/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/__init__.py
@@ -1,26 +1,33 @@
 """Implement models to extract data from SEC10k filings."""
 
 from dagster import (
+    Config,
     Definitions,
+    define_asset_job,
+    file_relative_path,
+    in_process_executor,
     load_assets_from_modules,
     load_assets_from_package_module,
 )
+from dagstermill import (
+    ConfigurableLocalOutputNotebookIOManager,
+    define_dagstermill_asset,
+)
 from upath import UPath
 
 from mozilla_sec_eia.library import model_jobs
 from mozilla_sec_eia.library.generic_io_managers import PandasParquetIOManager
 from mozilla_sec_eia.library.mlflow import (
+    MlflowPyfuncModelIOManager,
     mlflow_interface_resource,
     mlflow_train_test_io_managers,
 )
 
 from . import basic_10k, ex_21, extract
 from .utils.cloud import cloud_interface_resource
-from .utils.layoutlm import LayoutlmIOManager
 
 basic_10k_assets = load_assets_from_modules([basic_10k])
 ex21_assets = load_assets_from_package_module(ex_21)
-layoutlm_assets = load_assets_from_modules([ex_21.train_extractor])
 shared_assets = load_assets_from_modules([extract])
 
 basic_10k_production_job = model_jobs.create_production_model_job(
@@ -40,36 +47,45 @@
     concurrency_limit=4,
 )
 
-ex21_validation_job = model_jobs.create_validation_model_job(
-    "ex21_extraction_validation",
-    ex_21.validation_assets,
-)
 
-layoutlm_finetune_job = model_jobs.create_training_job(
-    "layoutlm_finetune",
-    layoutlm_assets,
+class TrainConfig(Config):
+    """Config for the `exhibit21_extractor` training notebook asset."""
+
+    uri: str = "runs:/c363159de2f5439c93dd972d51247370/layoutlm_extractor"
+    training_set: str = "labeledv0.2"
+
+
+exhibit21_extractor = define_dagstermill_asset(
+    name="exhibit21_extractor",
+    notebook_path=file_relative_path(__file__, "notebooks/exhibit21_extractor.ipynb"),
+    config_schema=TrainConfig.to_config_schema(),
+)
+ex21_training_job = define_asset_job(
+    "ex21_training",
+    selection=[exhibit21_extractor],
+    executor_def=in_process_executor,
 )
 
 
 defs = Definitions(
-    assets=basic_10k_assets + ex21_assets + shared_assets + layoutlm_assets,
+    assets=basic_10k_assets + ex21_assets + shared_assets + [exhibit21_extractor],
     jobs=[
         basic_10k_production_job,
         basic_10k_validation_job,
         ex21_production_job,
-        ex21_validation_job,
-        layoutlm_finetune_job,
+        ex21_training_job,
     ],
     resources={
         "cloud_interface": cloud_interface_resource,
         "mlflow_interface": mlflow_interface_resource,
-        "layoutlm_io_manager": LayoutlmIOManager(
-            mlflow_interface=mlflow_interface_resource
+        "layoutlm_io_manager": MlflowPyfuncModelIOManager(
+            mlflow_interface=mlflow_interface_resource,
+            uri="runs:/b959cfa0ba3c4b91a0f8fe158cd0109f/exhibit21_extractor",
         ),
         "pandas_parquet_io_manager": PandasParquetIOManager(
             base_path=UPath("gs://sec10k-outputs")
         ),
-        "exhibit21_extractor": ex_21.exhibit_21_extractor_resource,
+        "output_notebook_io_manager": ConfigurableLocalOutputNotebookIOManager(),
     }
     | mlflow_train_test_io_managers,
 )
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
index ce971c5..2daf5ae 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
@@ -4,9 +4,17 @@
 
 import mlflow
 import pandas as pd
-from dagster import AssetIn, AssetOut, Out, asset, graph_multi_asset, multi_asset, op
+from dagster import (
+    AssetIn,
+    AssetOut,
+    In,
+    Out,
+    asset,
+    graph_multi_asset,
+    multi_asset,
+    op,
+)
 
-from mozilla_sec_eia.library import validation_helpers
 from mozilla_sec_eia.library.mlflow import MlflowInterface, mlflow_interface_resource
 
 from ..entities import (
@@ -17,147 +25,24 @@
 )
 from ..extract import chunk_filings, sec10k_filing_metadata, year_quarter_partitions
 from ..utils.cloud import GCSArchive, cloud_interface_resource, get_metadata_filename
-from .inference import Exhibit21Extractor, clean_extracted_df, extract_filings
+from .inference import extract_filings
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
 
-@asset(dagster_type=ex21_extract_type)
-def ex21_validation_set() -> pd.DataFrame:
-    """Return dataframe containing exhibit 21 validation data."""
-    return clean_ex21_validation_set(
-        validation_helpers.load_validation_data("ex21_labels.csv")
-    )
-
-
-@asset
-def ex21_validation_filing_metadata(
-    cloud_interface: GCSArchive,
-    ex21_validation_set: pd.DataFrame,
-) -> pd.DataFrame:
-    """Get sec 10k filing metadata from validation set."""
-    filing_metadata = cloud_interface.get_metadata()
-    return filing_metadata[
-        filing_metadata.index.isin(ex21_validation_set["filename"].unique())
-    ]
-
-
-@multi_asset(
-    ins={
-        "computed_df": AssetIn("ex21_company_ownership_info_validation"),
-        "validation_df": AssetIn("ex21_validation_set"),
-    },
-    outs={
-        "ex21_jaccard_per_table": AssetOut(
-            io_manager_key="mlflow_pandas_artifact_io_manager"
-        ),
-        "ex21_precision_recall_per_table": AssetOut(
-            io_manager_key="mlflow_pandas_artifact_io_manager"
-        ),
-        "ex21_incorrect_filenames": AssetOut(
-            io_manager_key="mlflow_pandas_artifact_io_manager"
-        ),
-        "ex21_extraction_metrics": AssetOut(io_manager_key="mlflow_metrics_io_manager"),
-    },
-)
-def ex21_validation_metrics(computed_df: pd.DataFrame, validation_df: pd.DataFrame):
-    """Compute validation metrics for Ex. 21 extraction."""
-    shared_cols = validation_df.columns.intersection(computed_df.columns)
-    validation_df = validation_df.astype(computed_df[shared_cols].dtypes)
-    n_equal = 0
-    validation_filenames = validation_df["id"].unique()
-    n_files = len(validation_filenames)
-    table_metrics_dict = {}
-    jaccard_dict = {}
-    incorrect_files = []
-    # iterate through each file and check each extracted table
-    for filename in validation_filenames:
-        extracted_table_df = computed_df[computed_df["id"] == filename].reset_index(
-            drop=True
-        )
-        validation_table_df = validation_df[
-            validation_df["id"] == filename
-        ].reset_index(drop=True)
-        # check if the tables are exactly equal
-        if extracted_table_df.equals(validation_table_df):
-            # TODO: strip llc and other company strings before comparison
-            n_equal += 1
-        else:
-            incorrect_files.append(filename)
-        # compute precision and recall for each column
-        table_metrics_dict[filename] = {}
-        jaccard_dict[filename] = {}
-        for col in ["subsidiary", "loc", "own_per"]:
-            table_prec_recall = validation_helpers.pandas_compute_precision_recall(
-                extracted_table_df, validation_table_df, value_col=col
-            )
-            table_metrics_dict[filename][f"{col}_precision"] = table_prec_recall[
-                "precision"
-            ]
-            table_metrics_dict[filename][f"{col}_recall"] = table_prec_recall["recall"]
-            # get the jaccard similarity between columns
-            jaccard_dict[filename][col] = validation_helpers.jaccard_similarity(
-                computed_df=extracted_table_df,
-                validation_df=validation_table_df,
-                value_col=col,
-            )
-
-    jaccard_df = pd.DataFrame.from_dict(jaccard_dict, orient="index").reset_index()
-    prec_recall_df = pd.DataFrame.from_dict(
-        table_metrics_dict, orient="index"
-    ).reset_index()
-
-    return (
-        jaccard_df,
-        prec_recall_df,
-        pd.DataFrame({"filename": incorrect_files}),
-        {
-            "table_accuracy": n_equal / n_files,
-            "avg_subsidiary_jaccard_sim": jaccard_df["subsidiary"].sum() / n_files,
-            "avg_location_jaccard_sim": jaccard_df["loc"].sum() / n_files,
-            "avg_own_per_jaccard_sim": jaccard_df["own_per"].sum() / n_files,
-            "avg_subsidiary_precision": prec_recall_df["subsidiary_precision"].sum()
-            / n_files,
-            "avg_location_precision": prec_recall_df["loc_precision"].sum() / n_files,
-            "avg_own_per_precision": prec_recall_df["own_per_precision"].sum()
-            / n_files,
-            "avg_subsidiary_recall": prec_recall_df["subsidiary_recall"].sum()
-            / n_files,
-            "avg_location_recall": prec_recall_df["loc_recall"].sum() / n_files,
-            "avg_own_per_recall": prec_recall_df["own_per_recall"].sum() / n_files,
-        },
-    )
-
-
-def clean_ex21_validation_set(validation_df: pd.DataFrame):
-    """Clean Ex. 21 validation data to match extracted format."""
-    validation_df = validation_df.rename(
-        columns={
-            "Filename": "id",
-            "Subsidiary": "subsidiary",
-            "Location of Incorporation": "loc",
-            "Ownership Percentage": "own_per",
-        }
-    )
-    validation_df["own_per"] = validation_df["own_per"].astype(str)
-    validation_df["filename"] = validation_df["id"].apply(get_metadata_filename)
-    validation_df = clean_extracted_df(validation_df)
-    return validation_df
-
-
 @op(
     out={
         "metadata": Out(dagster_type=sec10k_extract_metadata_type),
         "extracted": Out(dagster_type=ex21_extract_type),
-    }
+    },
+    ins={"exhibit21_extractor": In(input_manager_key="layoutlm_io_manager")},
 )
 def extract_filing_chunk(
-    exhibit21_extractor: Exhibit21Extractor,
     filings: pd.DataFrame,
-    layoutlm,
+    exhibit21_extractor,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Extract a set of filings and return results."""
-    return extract_filings(exhibit21_extractor, filings, layoutlm)
+    return extract_filings(filings, exhibit21_extractor)
 
 
 @op(
@@ -196,17 +81,15 @@ def collect_extracted_chunks(
             io_manager_key="pandas_parquet_io_manager"
         ),
     },
-    ins={"layoutlm": AssetIn(input_manager_key="layoutlm_io_manager")},
     partitions_def=year_quarter_partitions,
 )
 def ex21_extract(
     sec10k_filing_metadata: pd.DataFrame,
-    layoutlm,
 ):
     """Extract ownership info from exhibit 21 docs."""
     filing_chunks = chunk_filings(sec10k_filing_metadata)
     metadata_chunks, extracted_chunks = filing_chunks.map(
-        lambda filings: extract_filing_chunk(filings, layoutlm)
+        lambda filings: extract_filing_chunk(filings)
     )
     metadata, extracted = collect_extracted_chunks(
         metadata_chunks.collect(), extracted_chunks.collect()
@@ -215,39 +98,4 @@ def ex21_extract(
     return metadata, extracted
 
 
-@multi_asset(
-    outs={
-        "ex21_extraction_metadata_validation": AssetOut(
-            io_manager_key="mlflow_pandas_artifact_io_manager",
-            dagster_type=sec10k_extract_metadata_type,
-        ),
-        "ex21_company_ownership_info_validation": AssetOut(
-            io_manager_key="mlflow_pandas_artifact_io_manager",
-            dagster_type=ex21_extract_type,
-        ),
-    },
-    ins={"layoutlm": AssetIn(input_manager_key="layoutlm_io_manager")},
-)
-def ex21_extract_validation(
-    ex21_validation_filing_metadata: pd.DataFrame,
-    exhibit21_extractor: Exhibit21Extractor,
-    layoutlm,
-):
-    """Extract ownership info from exhibit 21 docs."""
-    return extract_filings(
-        exhibit21_extractor, ex21_validation_filing_metadata, layoutlm
-    )
-
-
-exhibit_21_extractor_resource = Exhibit21Extractor(
-    cloud_interface=cloud_interface_resource,
-)
-
 production_assets = [sec10k_filing_metadata, ex21_extract]
-
-validation_assets = [
-    ex21_validation_set,
-    ex21_validation_filing_metadata,
-    ex21_extract_validation,
-    ex21_validation_metrics,
-]
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data.py
new file mode 100644
index 0000000..4e331c8
--- /dev/null
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data.py
@@ -0,0 +1 @@
+"""Define methods and assets for handling exhibit 21 datasets."""
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
index 5de6eb9..5633e40 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
@@ -4,25 +4,15 @@
 import os
 import tempfile
 import traceback
-from contextlib import contextmanager
 from pathlib import Path
 
 import numpy as np
 import pandas as pd
-import torch
-from dagster import ConfigurableResource
-from datasets import Dataset
-from transformers import (
-    Pipeline,
-    pipeline,
-)
-from transformers.tokenization_utils_base import BatchEncoding
+from mlflow.pyfunc import PyFuncModel
 
-from ..entities import Ex21CompanyOwnership, Sec10kExtractionMetadata
-from ..utils.cloud import GCSArchive, get_metadata_filename
+from ..entities import Ex21CompanyOwnership
+from ..utils.cloud import GCSArchive
 from ..utils.layoutlm import (
-    get_id_label_conversions,
-    iob_to_label,
     normalize_bboxes,
 )
 from ..utils.pdf import (
@@ -33,7 +23,6 @@
     format_label_studio_output,
     get_image_dict,
 )
-from .train_extractor import BBOX_COLS, LABELS
 
 # When handling multi page documents LayoutLM uses a sliding 'frame'
 # with some overlap between frames. The overlap creates multiple
@@ -50,6 +39,19 @@
     "O",
 ]
 
+LABELS = [
+    "O",
+    "B-Subsidiary",
+    "I-Subsidiary",
+    "B-Loc",
+    "I-Loc",
+    "B-Own_Per",
+    "I-Own_Per",
+]
+
+BBOX_COLS = ["top_left_x", "top_left_y", "bottom_right_x", "bottom_right_y"]
+label2id = {v: k for k, v in enumerate(LABELS)}
+
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
 
@@ -71,6 +73,41 @@ def format_unlabeled_pdf_dataframe(pdfs_dir: Path):
     return inference_df
 
 
+def _cache_pdfs(
+    filings: pd.DataFrame, cloud_interface: GCSArchive, pdf_dir: Path
+) -> pd.DataFrame:
+    """Write each filing's Ex. 21 to `pdf_dir` as a PDF; return failure records."""
+    extraction_metadata = pd.DataFrame(
+        {
+            "filename": pd.Series(dtype=str),
+            "success": pd.Series(dtype=bool),
+            "notes": pd.Series(dtype=str),
+        }
+    ).set_index("filename")
+
+    for filing in cloud_interface.iterate_filings(filings):
+        pdf_path = cloud_interface.get_local_filename(
+            cache_directory=pdf_dir, filing=filing, extension=".pdf"
+        )
+
+        # Some filings are poorly formatted and fail in `save_as_pdf`
+        # We want a record of these but don't want to stop run
+        try:
+            with pdf_path.open("wb") as f:
+                filing.ex_21.save_as_pdf(f)
+        except Exception as e:
+            extraction_metadata.loc[filing.filename, ["success"]] = False
+            extraction_metadata.loc[filing.filename, ["notes"]] = str(e)
+
+        # Some pdfs are empty. Check for these and remove from dir
+        if pdf_path.stat().st_size == 0:
+            extraction_metadata.loc[filing.filename, ["success"]] = False
+            extraction_metadata.loc[filing.filename, ["notes"]] = "PDF empty"
+            pdf_path.unlink()
+
+    return extraction_metadata
+
+
 def create_inference_dataset(
     filing_metadata: pd.DataFrame, cloud_interface: GCSArchive, has_labels: bool = False
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
@@ -120,42 +157,34 @@ def create_inference_dataset(
     return extraction_metadata, pd.DataFrame(annotations)
 
 
-def clean_extracted_df(extracted_df):
-    """Perform basic cleaning on a dataframe extracted from an Ex. 21."""
-    if extracted_df.empty:
-        return extracted_df
-    if "row" in extracted_df.columns:
-        extracted_df = extracted_df.drop(columns=["row"])
-    extracted_df["subsidiary"] = extracted_df["subsidiary"].str.strip().str.lower()
-    # strip special chars from the start and end of the string
-    extracted_df["subsidiary"] = extracted_df["subsidiary"].str.replace(
-        r"^[^\w&\s]+|[^\w&\s]+$", "", regex=True
-    )
-    if "loc" in extracted_df.columns:
-        extracted_df["loc"] = extracted_df["loc"].str.strip().str.lower()
-        extracted_df["loc"] = extracted_df["loc"].str.replace(
-            r"[^a-zA-Z&,\s]", "", regex=True
-        )
-    if "own_per" in extracted_df.columns:
-        # remove special chars and letters
-        extracted_df["own_per"] = extracted_df["own_per"].str.replace(
-            r"[^\d.]", "", regex=True
-        )
-        # Find values with multiple decimal points
-        extracted_df["own_per"] = extracted_df["own_per"].str.replace(
-            r"(\d*\.\d+)\..*", r"\1", regex=True
-        )
-        extracted_df["own_per"] = extracted_df["own_per"].replace("", np.nan)
-        extracted_df["own_per"] = extracted_df["own_per"].astype(
-            "float64", errors="ignore"
+def extract_filings(
+    filings: pd.DataFrame,
+    layoutlm: PyFuncModel,
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Create huggingface dataset from filings and perform extraction."""
+    try:
+        failed_metadata, dataset = create_inference_dataset(
+            filing_metadata=filings,
+            cloud_interface=GCSArchive(),
+            has_labels=False,
         )
-    # drop rows that have a null subsidiary value
-    extracted_df = extracted_df.dropna(subset="subsidiary")
-    return extracted_df
+        metadata, extracted = layoutlm.predict(dataset)
+        metadata = pd.concat([failed_metadata, metadata])
+    except Exception as e:
+        logger.warning(traceback.format_exc())
+        logger.warning(f"Error while extracting filings: {filings.index}")
+        metadata = pd.DataFrame(
+            {
+                "filename": filings.index,
+                "success": [False] * len(filings),
+                "notes": [str(e)] * len(filings),
+            }
+        ).set_index("filename")
+        extracted = Ex21CompanyOwnership.example(size=0)
+    return metadata, extracted
 
 
 def _sort_by_label_priority(target_array):
-    _, label2id = get_id_label_conversions(LABELS)
     id_priority = [label2id[label] for label in LABEL_PRIORITY]
     # Create a priority map from the label priority
     priority_map = {val: idx for idx, val in enumerate(id_priority)}
@@ -204,267 +233,3 @@ def get_flattened_mode_predictions(token_boxes_tensor, predictions_tensor):
     flattened_modes = modes[inverse_indices]
 
     return flattened_modes
-
-
-def _cache_pdfs(
-    filings: pd.DataFrame, cloud_interface: GCSArchive, pdf_dir: Path
-) -> pd.DataFrame:
-    """Iterate filings and cache pdfs."""
-    extraction_metadata = pd.DataFrame(
-        {
-            "filename": pd.Series(dtype=str),
-            "success": pd.Series(dtype=bool),
-            "notes": pd.Series(dtype=str),
-        }
-    ).set_index("filename")
-
-    for filing in cloud_interface.iterate_filings(filings):
-        pdf_path = cloud_interface.get_local_filename(
-            cache_directory=pdf_dir, filing=filing, extension=".pdf"
-        )
-
-        # Some filings are poorly formatted and fail in `save_as_pdf`
-        # We want a record of these but don't want to stop run
-        try:
-            with pdf_path.open("wb") as f:
-                filing.ex_21.save_as_pdf(f)
-        except Exception as e:
-            extraction_metadata.loc[filing.filename, ["success"]] = False
-            extraction_metadata.loc[filing.filename, ["note"]] = str(e)
-
-        # Some pdfs are empty. Check for these and remove from dir
-        if pdf_path.stat().st_size == 0:
-            extraction_metadata.loc[filing.filename, ["success"]] = False
-            extraction_metadata.loc[filing.filename, ["note"]] = "PDF empty"
-            pdf_path.unlink()
-
-    return extraction_metadata
-
-
-def _get_data(dataset):
-    yield from dataset
-
-
-class Exhibit21Extractor(ConfigurableResource):
-    """Implement `Sec10kExtractor` interface for exhibit 21 data."""
-
-    cloud_interface: GCSArchive
-    name: str = "exhibit21_extractor"
-    device: str = "cpu"
-    has_labels: bool = False
-    dataset_ind: list | None = None
-
-    @contextmanager
-    def setup_for_execution(self, context):
-        """Set env variable to improve GPU memory access."""
-        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
-
-    def extract_filings(
-        self, dataset: Dataset, model, processor
-    ) -> tuple[pd.DataFrame, pd.DataFrame]:
-        """Predict entities with a fine-tuned model and extract Ex. 21 tables."""
-        if self.dataset_ind:
-            dataset = dataset.select(self.dataset_ind)
-
-        # TODO: figure out device argument
-        pipe = pipeline(
-            "token-classification",
-            model=model,
-            tokenizer=processor,
-            pipeline_class=LayoutLMInferencePipeline,
-            device=self.device,
-        )
-
-        logits = []
-        predictions = []
-        all_output_df = Ex21CompanyOwnership.example(size=0)
-        extraction_metadata = Sec10kExtractionMetadata.example(size=0)
-        for logit, pred, output_df in pipe(_get_data(dataset)):
-            logits.append(logit)
-            predictions.append(pred)
-            if not output_df.empty:
-                filename = get_metadata_filename(output_df["id"].iloc[0])
-                extraction_metadata.loc[filename, ["success"]] = True
-            all_output_df = pd.concat([all_output_df, output_df])
-        all_output_df.columns.name = None
-        all_output_df = clean_extracted_df(all_output_df)
-        all_output_df = all_output_df[["id", "subsidiary", "loc", "own_per"]]
-        all_output_df = all_output_df.reset_index(drop=True)
-        return extraction_metadata, all_output_df
-
-
-def extract_filings(
-    exhibit21_extractor: Exhibit21Extractor,
-    filings: pd.DataFrame,
-    layoutlm,
-) -> tuple[pd.DataFrame, pd.DataFrame]:
-    """Create huggingface dataset from filings and perform extraction."""
-    try:
-        failed_metadata, dataset = create_inference_dataset(
-            filing_metadata=filings,
-            cloud_interface=exhibit21_extractor.cloud_interface,
-            has_labels=exhibit21_extractor.has_labels,
-        )
-        metadata, extracted = exhibit21_extractor.extract_filings(
-            dataset,
-            model=layoutlm["model"],
-            processor=layoutlm["tokenizer"],
-        )
-        metadata = pd.concat([failed_metadata, metadata])
-    except Exception as e:
-        logger.warning(traceback.format_exc())
-        logger.warning(f"Error while extracting filings: {filings.index}")
-        metadata = pd.DataFrame(
-            {
-                "filename": filings.index,
-                "success": [False] * len(filings),
-                "notes": [str(e)] * len(filings),
-            }
-        ).set_index("filename")
-        extracted = Ex21CompanyOwnership.example(size=0)
-    return metadata, extracted
-
-
-class LayoutLMInferencePipeline(Pipeline):
-    """Pipeline for performing inference with fine-tuned LayoutLM."""
-
-    def __init__(self, *args, **kwargs):
-        """Initialize LayoutLMInferencePipeline."""
-        super().__init__(*args, **kwargs)
-
-    def _sanitize_parameters(self, **kwargs):
-        preprocess_kwargs = {}
-        if "maybe_arg" in kwargs:
-            preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
-        return preprocess_kwargs, {}, {}
-
-    def preprocess(self, doc_dict):
-        """Encode and tokenize model inputs."""
-        image = doc_dict["image"]
-        words = doc_dict["tokens"]
-        boxes = doc_dict["bboxes"]
-        encoding = self.tokenizer(
-            image,
-            words,
-            boxes=boxes,
-            return_tensors="pt",
-            truncation=True,
-            padding="max_length",
-            max_length=512,  # this is the maximum max_length
-            stride=128,
-            return_offsets_mapping=True,
-            return_overflowing_tokens=True,
-        )
-        model_inputs = {}
-        model_inputs["raw_encoding"] = encoding.copy()
-        model_inputs["doc_dict"] = doc_dict
-        model_inputs["offset_mapping"] = encoding.pop("offset_mapping")
-        model_inputs["sample_mapping"] = encoding.pop("overflow_to_sample_mapping")
-        # TODO: do we actually need to make these into ints?
-        encoding["input_ids"] = encoding["input_ids"].to(torch.int64)
-        encoding["attention_mask"] = encoding["attention_mask"].to(torch.int64)
-        encoding["bbox"] = encoding["bbox"].to(torch.int64)
-        encoding["pixel_values"] = torch.stack(encoding["pixel_values"])
-        model_inputs["encoding"] = encoding
-        return model_inputs
-
-    def _forward(self, model_inputs):
-        # encoding is passed as a UserDict in the model_inputs dictionary
-        # turn it back into a BatchEncoding
-        encoding = BatchEncoding(model_inputs["encoding"])
-        if torch.cuda.is_available():
-            encoding.to("cuda")
-            self.model.to("cuda")
-        # since we're doing inference, we don't need gradient computation
-        with torch.no_grad():
-            output = self.model(**encoding)
-            return {
-                "logits": output.logits,
-                "predictions": output.logits.argmax(-1).squeeze().tolist(),
-                "raw_encoding": model_inputs["raw_encoding"],
-                "doc_dict": model_inputs["doc_dict"],
-            }
-
-    def postprocess(self, all_outputs):
-        """Return logits, model predictions, and the extracted dataframe."""
-        logits = all_outputs["logits"]
-        predictions = all_outputs["logits"].argmax(-1).squeeze().tolist()
-        output_df = self.extract_table(all_outputs)
-        return logits, predictions, output_df
-
-    def extract_table(self, all_outputs):
-        """Extract a structured table from a set of inference predictions.
-
-        This function essentially works by stacking bounding boxes and predictions
-        into a dataframe and going from left to right and top to bottom. Then, every
-        every time a new subsidiary entity is encountered, it assigns a new group or
-        "row" to that subsidiary. Next, location and ownership percentage words/labeled
-        entities in between these subsidiary groups are assigned to a subsidiary row/group.
-        Finally, this is all formatted into a dataframe with an ID column from the original
-        filename and a basic cleaning function normalizes strings.
-        """
-        # TODO: when model more mature, break this into sub functions to make it
-        # clearer what's going on
-        predictions = all_outputs["predictions"]
-        encoding = all_outputs["raw_encoding"]
-        doc_dict = all_outputs["doc_dict"]
-
-        token_boxes_tensor = encoding["bbox"].flatten(start_dim=0, end_dim=1)
-        predictions_tensor = torch.tensor(predictions)
-        mode_predictions = get_flattened_mode_predictions(
-            token_boxes_tensor, predictions_tensor
-        )
-        token_boxes = encoding["bbox"].flatten(start_dim=0, end_dim=1).tolist()
-        predicted_labels = [
-            self.model.config.id2label[pred] for pred in mode_predictions
-        ]
-        simple_preds = [iob_to_label(pred).lower() for pred in predicted_labels]
-
-        df = pd.DataFrame(data=token_boxes, columns=BBOX_COLS)
-        df.loc[:, "iob_pred"] = predicted_labels
-        df.loc[:, "pred"] = simple_preds
-        invalid_mask = (
-            (df["top_left_x"] == 0)
-            & (df["top_left_y"] == 0)
-            & (df["bottom_right_x"] == 0)
-            & (df["bottom_right_y"] == 0)
-        )
-        df = df[~invalid_mask]
-        # we want to get actual words on the dataframe, not just subwords that correspond to tokens
-        # subwords from the same word share the same bounding box coordinates
-        # so we merge the original words onto our dataframe on bbox coordinates
-        words_df = pd.DataFrame(data=doc_dict["bboxes"], columns=BBOX_COLS)
-        words_df.loc[:, "word"] = doc_dict["tokens"]
-        df = df.merge(words_df, how="left", on=BBOX_COLS).drop_duplicates(
-            subset=BBOX_COLS + ["pred", "word"]
-        )
-        # rows that are the first occurrence in a new group (subsidiary, loc, own_per)
-        # should always have a B entity label. Manually override labels so this is true.
-        first_in_group_df = df[
-            (df["pred"].ne(df["pred"].shift())) & (df["pred"] != "other")
-        ]
-        first_in_group_df.loc[:, "iob_pred"] = (
-            "B" + first_in_group_df["iob_pred"].str[1:]
-        )
-        df.update(first_in_group_df)
-        # filter for just words that were labeled with non "other" entities
-        entities_df = df.sort_values(by=["top_left_y", "top_left_x"])
-        entities_df = entities_df[entities_df["pred"] != "other"]
-        # words are labeled with IOB format which stands for inside, outside, beginning
-        # merge B and I entities to form one entity group
-        # (i.e. "B-Subsidiary" and "I-Subsidiary" become just "subsidiary"), assign a group ID
-        entities_df["group"] = (entities_df["iob_pred"].str.startswith("B-")).cumsum()
-        grouped_df = (
-            entities_df.groupby(["group", "pred"])["word"]
-            .apply(" ".join)
-            .reset_index()[["pred", "word"]]
-        )
-        # assign a new row every time there's a new subsidiary
-        grouped_df["row"] = (grouped_df["pred"].str.startswith("subsidiary")).cumsum()
-        output_df = grouped_df.pivot_table(
-            index="row", columns="pred", values="word", aggfunc=lambda x: " ".join(x)
-        ).reset_index()
-        if output_df.empty:
-            return output_df
-        output_df.loc[:, "id"] = doc_dict["id"]
-        return output_df
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/train_extractor.py b/src/mozilla_sec_eia/models/sec10k/ex_21/train_extractor.py
deleted file mode 100644
index cb37619..0000000
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/train_extractor.py
+++ /dev/null
@@ -1,192 +0,0 @@
-"""Fine-tune LayoutLM to extract Ex. 21 tables.
-
-This module uses a labeled training dataset to fine-tune
-a LayoutLM model to extract unstructured Exhibit 21 tables
-from SEC 10K filings.
-"""
-
-from pathlib import Path
-
-import numpy as np
-from dagster import Config, asset
-from datasets import (
-    Array2D,
-    Array3D,
-    Dataset,
-    Features,
-    Sequence,
-    Value,
-    load_metric,
-)
-from transformers import (
-    AutoProcessor,
-    LayoutLMv3ForTokenClassification,
-    Trainer,
-    TrainingArguments,
-)
-from transformers.data.data_collator import default_data_collator
-
-from ..utils.layoutlm import get_id_label_conversions
-from .create_labeled_dataset import format_as_ner_annotations
-
-LABELS = [
-    "O",
-    "B-Subsidiary",
-    "I-Subsidiary",
-    "B-Loc",
-    "I-Loc",
-    "B-Own_Per",
-    "I-Own_Per",
-]
-
-BBOX_COLS = ["top_left_x", "top_left_y", "bottom_right_x", "bottom_right_y"]
-
-
-def compute_metrics(p, metric, label_list, return_entity_level_metrics=False):
-    """Compute metrics to train and evaluate the model on."""
-    predictions, labels = p
-    predictions = np.argmax(predictions, axis=2)
-
-    # Remove ignored index (special tokens)
-    true_predictions = [
-        [label_list[pred] for (pred, lab) in zip(prediction, label) if lab != -100]
-        for prediction, label in zip(predictions, labels)
-    ]
-    true_labels = [
-        [label_list[lab] for (pred, lab) in zip(prediction, label) if lab != -100]
-        for prediction, label in zip(predictions, labels)
-    ]
-
-    results = metric.compute(predictions=true_predictions, references=true_labels)
-    if return_entity_level_metrics:
-        # Unpack nested dictionaries
-        final_results = {}
-        for key, value in results.items():
-            if isinstance(value, dict):
-                for n, v in value.items():
-                    final_results[f"{key}_{n}"] = v
-            else:
-                final_results[key] = value
-        return final_results
-    return {
-        "precision": results["overall_precision"],
-        "recall": results["overall_recall"],
-        "f1": results["overall_f1"],
-        "accuracy": results["overall_accuracy"],
-    }
-
-
-def _prepare_dataset(annotations, processor, label2id):
-    """Put the dataset in its final format for training LayoutLM."""
-
-    def _convert_ner_tags_to_id(ner_tags, label2id):
-        return [int(label2id[ner_tag]) for ner_tag in ner_tags]
-
-    images = annotations["image"]
-    words = annotations["tokens"]
-    boxes = annotations["bboxes"]
-    # Map over labels and convert to numeric id for each ner_tag
-    ner_tags = [
-        _convert_ner_tags_to_id(ner_tags, label2id)
-        for ner_tags in annotations["ner_tags"]
-    ]
-
-    encoding = processor(
-        images,
-        words,
-        boxes=boxes,
-        word_labels=ner_tags,
-        truncation=True,
-        padding="max_length",
-    )
-
-    return encoding
-
-
-def load_test_train_set(
-    processor: AutoProcessor, test_size: float, ner_annotations: list[dict]
-):
-    """Load training/test set and prepare for training or evaluation."""
-    id2label, label2id = get_id_label_conversions(LABELS)
-    # Cache/prepare training data
-    dataset = Dataset.from_list(ner_annotations)
-
-    # Prepare our train & eval dataset
-    column_names = dataset.column_names
-    features = Features(
-        {
-            "pixel_values": Array3D(dtype="float32", shape=(3, 224, 224)),
-            "input_ids": Sequence(feature=Value(dtype="int64")),
-            "attention_mask": Sequence(Value(dtype="int64")),
-            "bbox": Array2D(dtype="int64", shape=(512, 4)),
-            "labels": Sequence(feature=Value(dtype="int64")),
-        }
-    )
-    dataset = dataset.map(
-        lambda annotations: _prepare_dataset(annotations, processor, label2id),
-        batched=True,
-        remove_columns=column_names,
-        features=features,
-    )
-    dataset.set_format("torch")
-    split_dataset = dataset.train_test_split(test_size=test_size)
-    return split_dataset["train"], split_dataset["test"]
-
-
-class FineTuneConfig(Config):
-    """Configuration to supply to `train_model`."""
-
-    labeled_json_path: str = "sec10k_filings/labeled_jsons/"
-    gcs_training_data_dir: str = "labeled"
-    output_dir: str = "layoutlm_trainer"
-    test_size: float = 0.2
-
-
-@asset(io_manager_key="layoutlm_io_manager")
-def layoutlm(
-    config: FineTuneConfig,
-):
-    """Train LayoutLM model with labeled data."""
-    # Prepare model
-    id2label, label2id = get_id_label_conversions(LABELS)
-    model = LayoutLMv3ForTokenClassification.from_pretrained(
-        "microsoft/layoutlmv3-base", id2label=id2label, label2id=label2id
-    )
-    processor = AutoProcessor.from_pretrained(
-        "microsoft/layoutlmv3-base", apply_ocr=False
-    )
-    ner_annotations = format_as_ner_annotations(
-        labeled_json_path=Path(config.labeled_json_path),
-        gcs_folder_name=config.gcs_training_data_dir,
-    )
-    # Get training/test data using pre-trained processor to prepare data
-    train_dataset, eval_dataset = load_test_train_set(
-        processor=processor, test_size=config.test_size, ner_annotations=ner_annotations
-    )
-
-    # Initialize our Trainer
-    metric = load_metric("seqeval")
-    training_args = TrainingArguments(
-        output_dir=config.output_dir,
-        max_steps=1000,
-        per_device_train_batch_size=1,
-        per_device_eval_batch_size=1,
-        learning_rate=1e-5,
-        evaluation_strategy="steps",
-        eval_steps=100,
-        load_best_model_at_end=True,
-        metric_for_best_model="f1",
-    )
-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
-        tokenizer=processor,
-        data_collator=default_data_collator,
-        compute_metrics=lambda p: compute_metrics(p, metric=metric, label_list=LABELS),
-    )
-
-    # Train inside mlflow run. Mlflow will automatically handle logging training metrcis
-    trainer.train()
-    return trainer
diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
new file mode 100644
index 0000000..4efc905
--- /dev/null
+++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
@@ -0,0 +1,1084 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "0da8c588-2d09-464b-945f-168704c0cdac",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "# Exhibit 21 extraction\n",
+    "\n",
+    "This notebook implements a model built on top of [layoutlmv3](https://huggingface.co/microsoft/layoutlmv3-base/tree/main)\n",
+    "that extracts structured tables from Exhibit 21 attachments to SEC-10k filings. These documents contain a list of all subsidiary companies owned by a filing\n",
+    "company."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "48f185de-95ef-4194-9245-93f8d603d2e6",
+   "metadata": {
+    "tags": [
+     "parameters"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "import dagstermill\n",
+    "\n",
+    "context = dagstermill.get_context(op_config={\n",
+    "    \"uri\": \"runs:/c363159de2f5439c93dd972d51247370/layoutlm_extractor\",\n",
+    "    \"training_set\": \"labeledv0.2\",\n",
+    "})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7f299b2b-2358-4526-b023-f29c817316d9",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "## Train Layoutlmv3"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "32edcce1-ab18-40b6-9da8-ce0ea53c2f72",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "### Setup training/test sets"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8b389646-c4af-4c92-a29e-b4b23f4c391b",
+   "metadata": {},
+   "source": [
+    "Download training data and convert to NER annotations. This involves converting Exhibit 21 filings into PDFs, then using labels generated by Label Studio to produce the annotations. These annotations are then used to create a huggingface dataset that will be used for training.\n",
+    "\n",
+    "First define several helper functions to do the conversion."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "9372b908-d9b9-4d18-a5bf-d332648b3e49",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import os\n",
+    "from pathlib import Path\n",
+    "from tempfile import TemporaryDirectory\n",
+    "\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "\n",
+    "from mozilla_sec_eia.library import validation_helpers\n",
+    "from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive, get_metadata_filename\n",
+    "from mozilla_sec_eia.models.sec10k.utils.layoutlm import normalize_bboxes\n",
+    "from mozilla_sec_eia.models.sec10k.utils.pdf import (\n",
+    "    get_pdf_data_from_path,\n",
+    "    render_page,\n",
+    ")\n",
+    "\n",
+    "# Set some constants\n",
+    "LABELS = [\n",
+    "    \"O\",\n",
+    "    \"B-Subsidiary\",\n",
+    "    \"I-Subsidiary\",\n",
+    "    \"B-Loc\",\n",
+    "    \"I-Loc\",\n",
+    "    \"B-Own_Per\",\n",
+    "    \"I-Own_Per\",\n",
+    "]\n",
+    "LABEL_PRIORITY = [\n",
+    "    \"I-Subsidiary\",\n",
+    "    \"I-Loc\",\n",
+    "    \"I-Own_Per\",\n",
+    "    \"B-Subsidiary\",\n",
+    "    \"B-Loc\",\n",
+    "    \"B-Own_Per\",\n",
+    "    \"O\",\n",
+    "]\n",
+    "\n",
+    "BBOX_COLS = [\"top_left_x\", \"top_left_y\", \"bottom_right_x\", \"bottom_right_y\"]\n",
+    "BBOX_COLS_PDF = [\n",
+    "    \"top_left_x_pdf\",\n",
+    "    \"top_left_y_pdf\",\n",
+    "    \"bottom_right_x_pdf\",\n",
+    "    \"bottom_right_y_pdf\",\n",
+    "]\n",
+    "\n",
+    "# Map back and forth between id's and labels\n",
+    "id2label = dict(enumerate(LABELS))\n",
+    "label2id = {v: k for k, v in enumerate(LABELS)}\n",
+    "\n",
+    "def _is_cik_in_training_data(labeled_json_filename, tracking_df):\n",
+    "    # TODO: for now CIK is stored as an int, update when fixed\n",
+    "    cik = int(labeled_json_filename.split(\"/\")[-1].split(\"-\")[0])\n",
+    "    return cik in tracking_df.CIK.unique()\n",
+    "\n",
+    "\n",
+    "def format_label_studio_output(\n",
+    "    labeled_json_dir: Path,\n",
+    "    pdfs_dir: Path,\n",
+    ") -> pd.DataFrame:\n",
+    "    \"\"\"Format Label Studio output JSONs into dataframe.\"\"\"\n",
+    "    labeled_df = pd.DataFrame()\n",
+    "    # TODO: make this path stuff less janky?\n",
+    "    tracking_df = validation_helpers.load_training_data(\"ex21_labels.csv\")\n",
+    "    for json_filename in os.listdir(labeled_json_dir):\n",
+    "        if not json_filename[0].isdigit() or json_filename.endswith(\".json\"):\n",
+    "            continue\n",
+    "        json_file_path = labeled_json_dir / json_filename\n",
+    "        with Path.open(json_file_path) as j:\n",
+    "            doc_dict = json.loads(j.read())\n",
+    "\n",
+    "        filename = doc_dict[\"task\"][\"data\"][\"ocr\"].split(\"/\")[-1].split(\".\")[0]\n",
+    "        # check if old local naming schema is being used\n",
+    "        if len(filename.split(\"-\")) == 6:\n",
+    "            filename = \"-\".join(filename.split(\"-\")[2:])\n",
+    "        if not _is_cik_in_training_data(filename, tracking_df=tracking_df):\n",
+    "            continue\n",
+    "\n",
+    "        pdf_filename = filename + \".pdf\"\n",
+    "        src_path = pdfs_dir / pdf_filename\n",
+    "        extracted, pg = get_pdf_data_from_path(src_path)\n",
+    "        txt = extracted[\"pdf_text\"]\n",
+    "        pg_meta = extracted[\"page\"]\n",
+    "        # normalize bboxes between 0 and 1000 for Hugging Face\n",
+    "        txt = normalize_bboxes(txt_df=txt, pg_meta_df=pg_meta)\n",
+    "        # parse the output dictionary of labeled bounding boxes from Label Studio\n",
+    "        doc_df = pd.DataFrame()\n",
+    "        for item in doc_dict[\"result\"]:\n",
+    "            value = item[\"value\"]\n",
+    "            # sometimes Label Studio will fill in an empty list as a label\n",
+    "            # when there is really no label\n",
+    "            # TODO: do this without dict comprehension?\n",
+    "            if (\"labels\" in value) and value[\"labels\"] == []:\n",
+    "                value = {k: v for k, v in value.items() if k != \"labels\"}\n",
+    "            ind = int(item[\"id\"].split(\"_\")[-1])\n",
+    "            doc_df = pd.concat([doc_df, pd.DataFrame(value, index=[ind])])\n",
+    "\n",
+    "        # combine the bounding boxes for each word\n",
+    "        doc_df = doc_df.groupby(level=0).first()\n",
+    "        txt.loc[:, \"id\"] = filename\n",
+    "        # TODO: probably want to filter out these empty Ex. 21 docs\n",
+    "        # the doc might not have any labels in it if it was an empty Ex. 21\n",
+    "        if \"labels\" not in doc_df:\n",
+    "            doc_df.loc[:, \"labels\"] = pd.Series()\n",
+    "\n",
+    "        output_df = pd.concat([txt, doc_df[[\"labels\"]]], axis=1)\n",
+    "        labeled_df = pd.concat([labeled_df, output_df])\n",
+    "\n",
+    "    # fill in unlabeled words and clean up labeled dataframe\n",
+    "    labeled_df[\"labels\"] = labeled_df[\"labels\"].fillna(\"O\")\n",
+    "    labeled_df = labeled_df.rename(columns={\"labels\": \"ner_tag\"})\n",
+    "    non_id_columns = [col for col in labeled_df.columns if col != \"id\"]\n",
+    "    labeled_df = labeled_df.loc[:, [\"id\"] + non_id_columns]\n",
+    "\n",
+    "    # TODO: add in sanity checks on labeled_df bounding boxes to make sure\n",
+    "    # that no value is above 1000 or below 0\n",
+    "\n",
+    "    return labeled_df\n",
+    "\n",
+    "\n",
+    "def get_image_dict(pdfs_dir):\n",
+    "    \"\"\"Create a dictionary with filenames and their Ex. 21 images.\"\"\"\n",
+    "    image_dict = {}\n",
+    "    for pdf_filename in os.listdir(pdfs_dir):\n",
+    "        if pdf_filename.split(\".\")[-1] != \"pdf\":\n",
+    "            continue\n",
+    "        pdf_file_path = pdfs_dir / pdf_filename\n",
+    "        _, pg = get_pdf_data_from_path(pdf_file_path)\n",
+    "        full_pg_img = render_page(pg)\n",
+    "        filename = pdf_filename.split(\".\")[0]\n",
+    "        image_dict[filename] = full_pg_img\n",
+    "    return image_dict\n",
+    "\n",
+    "\n",
+    "def format_as_ner_annotations(\n",
+    "    labeled_json_path: Path,\n",
+    "    pdfs_path: Path,\n",
+    "    gcs_folder_name: Path,\n",
+    ") -> list[dict]:\n",
+    "    \"\"\"Format Label Studio output JSONs as NER annotations.\n",
+    "\n",
+    "    Formats the dataframe as named entity recognition annotations.\n",
+    "    # TODO: say more about this format\n",
+    "\n",
+    "    Returns:\n",
+    "        ner_annotations: a list of dicts, with one dict for each doc.\n",
+    "    \"\"\"\n",
+    "    GCSArchive().cache_training_data(\n",
+    "        json_cache_path=labeled_json_path,\n",
+    "        pdf_cache_path=pdfs_path,\n",
+    "        gcs_folder_name=gcs_folder_name\n",
+    "    )\n",
+    "\n",
+    "    labeled_df = format_label_studio_output(\n",
+    "        labeled_json_dir=labeled_json_path, pdfs_dir=pdfs_path\n",
+    "    )\n",
+    "    # convert dataframe/dictionary into NER format\n",
+    "    # document_annotation_to_ner https://github.com/butlerlabs/docai/blob/main/docai/annotations/ner_utils.py\n",
+    "    # complete dataset is a list of dicts, with one dict for each doc\n",
+    "    doc_filenames = labeled_df[\"id\"].unique()\n",
+    "    image_dict = get_image_dict(pdfs_dir=pdfs_path)\n",
+    "    ner_annotations = []\n",
+    "    for filename in doc_filenames:\n",
+    "        annotation = {\n",
+    "            \"id\": filename,\n",
+    "            \"tokens\": labeled_df.groupby(\"id\")[\"text\"].apply(list).loc[filename],\n",
+    "            \"ner_tags\": labeled_df.groupby(\"id\")[\"ner_tag\"].apply(list).loc[filename],\n",
+    "            \"bboxes\": labeled_df.loc[labeled_df[\"id\"] == filename, :][BBOX_COLS_PDF]\n",
+    "            .to_numpy()\n",
+    "            .tolist(),\n",
+    "            \"image\": image_dict[filename],\n",
+    "        }\n",
+    "        ner_annotations.append(annotation)\n",
+    "\n",
+    "    return ner_annotations\n",
+    "\n",
+    "def _prepare_dataset(annotations, processor, label2id):\n",
+    "    \"\"\"Put the dataset in its final format for training LayoutLM.\"\"\"\n",
+    "\n",
+    "    def _convert_ner_tags_to_id(ner_tags, label2id):\n",
+    "        return [int(label2id[ner_tag]) for ner_tag in ner_tags]\n",
+    "\n",
+    "    images = annotations[\"image\"]\n",
+    "    words = annotations[\"tokens\"]\n",
+    "    boxes = annotations[\"bboxes\"]\n",
+    "    # Map over labels and convert to numeric id for each ner_tag\n",
+    "    ner_tags = [\n",
+    "        _convert_ner_tags_to_id(ner_tags, label2id)\n",
+    "        for ner_tags in annotations[\"ner_tags\"]\n",
+    "    ]\n",
+    "\n",
+    "    encoding = processor(\n",
+    "        images,\n",
+    "        words,\n",
+    "        boxes=boxes,\n",
+    "        word_labels=ner_tags,\n",
+    "        truncation=True,\n",
+    "        padding=\"max_length\",\n",
+    "    )\n",
+    "\n",
+    "    return encoding\n",
+    "\n",
+    "def compute_metrics(p, metric, label_list, return_entity_level_metrics=False):\n",
+    "    \"\"\"Compute metrics to train and evaluate the model on.\"\"\"\n",
+    "    predictions, labels = p\n",
+    "    predictions = np.argmax(predictions, axis=2)\n",
+    "\n",
+    "    # Remove ignored index (special tokens)\n",
+    "    true_predictions = [\n",
+    "        [label_list[pred] for (pred, lab) in zip(prediction, label) if lab != -100]\n",
+    "        for prediction, label in zip(predictions, labels)\n",
+    "    ]\n",
+    "    true_labels = [\n",
+    "        [label_list[lab] for (pred, lab) in zip(prediction, label) if lab != -100]\n",
+    "        for prediction, label in zip(predictions, labels)\n",
+    "    ]\n",
+    "\n",
+    "    results = metric.compute(predictions=true_predictions, references=true_labels)\n",
+    "    if return_entity_level_metrics:\n",
+    "        # Unpack nested dictionaries\n",
+    "        final_results = {}\n",
+    "        for key, value in results.items():\n",
+    "            if isinstance(value, dict):\n",
+    "                for n, v in value.items():\n",
+    "                    final_results[f\"{key}_{n}\"] = v\n",
+    "            else:\n",
+    "                final_results[key] = value\n",
+    "        return final_results\n",
+    "    return {\n",
+    "        \"precision\": results[\"overall_precision\"],\n",
+    "        \"recall\": results[\"overall_recall\"],\n",
+    "        \"f1\": results[\"overall_f1\"],\n",
+    "        \"accuracy\": results[\"overall_accuracy\"],\n",
+    "    }"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8160263c-8f69-437c-918b-e56ad007961a",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "#### Finetune Model\n",
+    "The next cell will use the functions defined in the previous section to actually construct a huggingface dataset from labeled data and finetune the `layoutlm` model. Model finetuning will only be run if configured to do so, otherwise a pretrained version will be used from the `mlflow` tracking server.\n",
+    "\n",
+    "Model training contains several steps implemented below:\n",
+    "1. Use a temporary path to convert filings to PDFs and stash labels\n",
+    "2. Convert the PDFs and labels to NER annotations\n",
+    "3. Construct huggingface dataset from NER annotations and split into train and test sets\n",
+    "4. Load pretrained model from huggingface\n",
+    "5. Finetune model on training data and evaluate on test data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "71d205b2-e6ea-4ad0-982c-22e762269119",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import mlflow\n",
+    "from datasets import (\n",
+    "    Array2D,\n",
+    "    Array3D,\n",
+    "    Dataset,\n",
+    "    Features,\n",
+    "    Sequence,\n",
+    "    Value,\n",
+    "    load_metric,\n",
+    ")\n",
+    "from dotenv import load_dotenv\n",
+    "from transformers import (\n",
+    "    AutoProcessor,\n",
+    "    LayoutLMv3ForTokenClassification,\n",
+    "    Trainer,\n",
+    "    TrainingArguments,\n",
+    ")\n",
+    "from transformers.data.data_collator import default_data_collator\n",
+    "\n",
+    "from mozilla_sec_eia.library.mlflow import configure_mlflow\n",
+    "\n",
+    "load_dotenv()\n",
+    "\n",
+    "\n",
+    "configure_mlflow()\n",
+    "mlflow.set_experiment(\"exhibit21_extraction_test\")\n",
+    "\n",
+    "# Only finetune if configured to do so\n",
+    "training_run_id = None\n",
+    "if context.op_config[\"uri\"] is None:\n",
+    "    # Change temp_dir to save training data locally for inspection\n",
+    "    with TemporaryDirectory() as temp_dir:\n",
+    "        ner_annotations = format_as_ner_annotations(\n",
+    "            labeled_json_path=Path(temp_dir) / \"sec10k_filings\" / \"labeled_jsons\",\n",
+    "            pdfs_path=Path(temp_dir) / \"sec10k_filings\" / \"pdfs\",\n",
+    "            gcs_folder_name=context.op_config[\"training_set\"],\n",
+    "        )\n",
+    "\n",
+    "    # Cache/prepare training data\n",
+    "    dataset = Dataset.from_list(ner_annotations)\n",
+    "\n",
+    "    # Load pretrained model\n",
+    "    model = LayoutLMv3ForTokenClassification.from_pretrained(\n",
+    "        \"microsoft/layoutlmv3-base\", id2label=id2label, label2id=label2id\n",
+    "    )\n",
+    "    processor = AutoProcessor.from_pretrained(\n",
+    "        \"microsoft/layoutlmv3-base\", apply_ocr=False\n",
+    "    )\n",
+    "\n",
+    "    # Prepare our train & eval dataset\n",
+    "    column_names = dataset.column_names\n",
+    "    features = Features(\n",
+    "        {\n",
+    "            \"pixel_values\": Array3D(dtype=\"float32\", shape=(3, 224, 224)),\n",
+    "            \"input_ids\": Sequence(feature=Value(dtype=\"int64\")),\n",
+    "            \"attention_mask\": Sequence(Value(dtype=\"int64\")),\n",
+    "            \"bbox\": Array2D(dtype=\"int64\", shape=(512, 4)),\n",
+    "            \"labels\": Sequence(feature=Value(dtype=\"int64\")),\n",
+    "        }\n",
+    "    )\n",
+    "    dataset = dataset.map(\n",
+    "        lambda annotations: _prepare_dataset(annotations, processor, label2id),\n",
+    "        batched=True,\n",
+    "        remove_columns=column_names,\n",
+    "        features=features,\n",
+    "    )\n",
+    "    dataset.set_format(\"torch\")\n",
+    "    split_dataset = dataset.train_test_split(test_size=0.2)\n",
+    "    train_dataset, eval_dataset = split_dataset[\"train\"], split_dataset[\"test\"]\n",
+    "\n",
+    "    # Initialize our Trainer\n",
+    "    metric = load_metric(\"seqeval\")\n",
+    "    training_args = TrainingArguments(\n",
+    "        max_steps=1000,\n",
+    "        per_device_train_batch_size=1,\n",
+    "        per_device_eval_batch_size=1,\n",
+    "        learning_rate=1e-5,\n",
+    "        evaluation_strategy=\"steps\",\n",
+    "        eval_steps=100,\n",
+    "        load_best_model_at_end=True,\n",
+    "        metric_for_best_model=\"f1\",\n",
+    "        output_dir=\"./layoutlm\",\n",
+    "    )\n",
+    "    trainer = Trainer(\n",
+    "        model=model,\n",
+    "        args=training_args,\n",
+    "        train_dataset=train_dataset,\n",
+    "        eval_dataset=eval_dataset,\n",
+    "        tokenizer=processor,\n",
+    "        data_collator=default_data_collator,\n",
+    "        compute_metrics=lambda p: compute_metrics(p, metric=metric, label_list=LABELS),\n",
+    "    )\n",
+    "\n",
+    "    with mlflow.start_run() as training_run:\n",
+    "        # Train inside mlflow run. Mlflow will automatically handle logging training metrics\n",
+    "        trainer.train()\n",
+    "\n",
+    "        # Log finetuned model with mlflow\n",
+    "        model = {\"model\": trainer.model, \"tokenizer\": trainer.tokenizer}\n",
+    "        mlflow.transformers.log_model(\n",
+    "            model, artifact_path=\"layoutlm_extractor\", task=\"token-classification\"\n",
+    "        )\n",
+    "        training_run_id = training_run.info. run_id"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ee9b4e20-7781-43a7-b7aa-caf0690a201e",
+   "metadata": {},
+   "source": [
+    "## Model inference\n",
+    "Use the finetuned model to perform inference and evaluate on labeled validation data. First create a Huggingface `Pipeline` which wraps layoutlm with some custom pre/post processing steps."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "42c8e920-d671-40c2-b5db-c43611a33897",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "from transformers import Pipeline, pipeline\n",
+    "from transformers.tokenization_utils_base import BatchEncoding\n",
+    "\n",
+    "from mozilla_sec_eia.models.sec10k.inference import get_flattened_mode_predictions\n",
+    "from mozilla_sec_eia.models.sec10k.utils.layoutlm import (\n",
+    "    iob_to_label,\n",
+    ")\n",
+    "\n",
+    "\n",
+    "class LayoutLMInferencePipeline(Pipeline):\n",
+    "    \"\"\"Pipeline for performing inference with fine-tuned LayoutLM.\"\"\"\n",
+    "\n",
+    "    def __init__(self, *args, **kwargs):\n",
+    "        \"\"\"Initialize LayoutLMInferencePipeline.\"\"\"\n",
+    "        super().__init__(*args, **kwargs)\n",
+    "\n",
+    "    def _sanitize_parameters(self, **kwargs):\n",
+    "        preprocess_kwargs = {}\n",
+    "        if \"maybe_arg\" in kwargs:\n",
+    "            preprocess_kwargs[\"maybe_arg\"] = kwargs[\"maybe_arg\"]\n",
+    "        return preprocess_kwargs, {}, {}\n",
+    "\n",
+    "    def preprocess(self, doc_dict):\n",
+    "        \"\"\"Encode and tokenize model inputs.\"\"\"\n",
+    "        image = doc_dict[\"image\"]\n",
+    "        words = doc_dict[\"tokens\"]\n",
+    "        boxes = doc_dict[\"bboxes\"]\n",
+    "        encoding = self.tokenizer(\n",
+    "            image,\n",
+    "            words,\n",
+    "            boxes=boxes,\n",
+    "            return_tensors=\"pt\",\n",
+    "            truncation=True,\n",
+    "            padding=\"max_length\",\n",
+    "            max_length=512,  # this is the maximum max_length\n",
+    "            stride=128,\n",
+    "            return_offsets_mapping=True,\n",
+    "            return_overflowing_tokens=True,\n",
+    "        )\n",
+    "        model_inputs = {}\n",
+    "        model_inputs[\"raw_encoding\"] = encoding.copy()\n",
+    "        model_inputs[\"doc_dict\"] = doc_dict\n",
+    "        model_inputs[\"offset_mapping\"] = encoding.pop(\"offset_mapping\")\n",
+    "        model_inputs[\"sample_mapping\"] = encoding.pop(\"overflow_to_sample_mapping\")\n",
+    "        # TODO: do we actually need to make these into ints?\n",
+    "        encoding[\"input_ids\"] = encoding[\"input_ids\"].to(torch.int64)\n",
+    "        encoding[\"attention_mask\"] = encoding[\"attention_mask\"].to(torch.int64)\n",
+    "        encoding[\"bbox\"] = encoding[\"bbox\"].to(torch.int64)\n",
+    "        encoding[\"pixel_values\"] = torch.stack(encoding[\"pixel_values\"])\n",
+    "        model_inputs[\"encoding\"] = encoding\n",
+    "        return model_inputs\n",
+    "\n",
+    "    def _forward(self, model_inputs):\n",
+    "        # encoding is passed as a UserDict in the model_inputs dictionary\n",
+    "        # turn it back into a BatchEncoding\n",
+    "        encoding = BatchEncoding(model_inputs[\"encoding\"])\n",
+    "        if torch.cuda.is_available():\n",
+    "            encoding.to(\"cuda\")\n",
+    "            self.model.to(\"cuda\")\n",
+    "        # since we're doing inference, we don't need gradient computation\n",
+    "        with torch.no_grad():\n",
+    "            output = self.model(**encoding)\n",
+    "            return {\n",
+    "                \"logits\": output.logits,\n",
+    "                \"predictions\": output.logits.argmax(-1).squeeze().tolist(),\n",
+    "                \"raw_encoding\": model_inputs[\"raw_encoding\"],\n",
+    "                \"doc_dict\": model_inputs[\"doc_dict\"],\n",
+    "            }\n",
+    "\n",
+    "    def postprocess(self, all_outputs):\n",
+    "        \"\"\"Return logits, model predictions, and the extracted dataframe.\"\"\"\n",
+    "        logits = all_outputs[\"logits\"]\n",
+    "        predictions = all_outputs[\"logits\"].argmax(-1).squeeze().tolist()\n",
+    "        output_df = self.extract_table(all_outputs)\n",
+    "        return logits, predictions, output_df\n",
+    "\n",
+    "    def extract_table(self, all_outputs):\n",
+    "        \"\"\"Extract a structured table from a set of inference predictions.\n",
+    "\n",
+    "        This function essentially works by stacking bounding boxes and predictions\n",
+    "        into a dataframe and going from left to right and top to bottom. Then,\n",
+    "        every time a new subsidiary entity is encountered, it assigns a new group or\n",
+    "        \"row\" to that subsidiary. Next, location and ownership percentage words/labeled\n",
+    "        entities in between these subsidiary groups are assigned to a subsidiary row/group.\n",
+    "        Finally, this is all formatted into a dataframe with an ID column from the original\n",
+    "        filename and a basic cleaning function normalizes strings.\n",
+    "        \"\"\"\n",
+    "        # TODO: when model more mature, break this into sub functions to make it\n",
+    "        # clearer what's going on\n",
+    "        predictions = all_outputs[\"predictions\"]\n",
+    "        encoding = all_outputs[\"raw_encoding\"]\n",
+    "        doc_dict = all_outputs[\"doc_dict\"]\n",
+    "\n",
+    "        token_boxes_tensor = encoding[\"bbox\"].flatten(start_dim=0, end_dim=1)\n",
+    "        predictions_tensor = torch.tensor(predictions)\n",
+    "        mode_predictions = get_flattened_mode_predictions(\n",
+    "            token_boxes_tensor, predictions_tensor\n",
+    "        )\n",
+    "        token_boxes = encoding[\"bbox\"].flatten(start_dim=0, end_dim=1).tolist()\n",
+    "        predicted_labels = [\n",
+    "            self.model.config.id2label[pred] for pred in mode_predictions\n",
+    "        ]\n",
+    "        simple_preds = [iob_to_label(pred).lower() for pred in predicted_labels]\n",
+    "\n",
+    "        df = pd.DataFrame(data=token_boxes, columns=BBOX_COLS)\n",
+    "        df.loc[:, \"iob_pred\"] = predicted_labels\n",
+    "        df.loc[:, \"pred\"] = simple_preds\n",
+    "        invalid_mask = (\n",
+    "            (df[\"top_left_x\"] == 0)\n",
+    "            & (df[\"top_left_y\"] == 0)\n",
+    "            & (df[\"bottom_right_x\"] == 0)\n",
+    "            & (df[\"bottom_right_y\"] == 0)\n",
+    "        )\n",
+    "        df = df[~invalid_mask]\n",
+    "        # we want to get actual words on the dataframe, not just subwords that correspond to tokens\n",
+    "        # subwords from the same word share the same bounding box coordinates\n",
+    "        # so we merge the original words onto our dataframe on bbox coordinates\n",
+    "        words_df = pd.DataFrame(data=doc_dict[\"bboxes\"], columns=BBOX_COLS)\n",
+    "        words_df.loc[:, \"word\"] = doc_dict[\"tokens\"]\n",
+    "        df = df.merge(words_df, how=\"left\", on=BBOX_COLS).drop_duplicates(\n",
+    "            subset=BBOX_COLS + [\"pred\", \"word\"]\n",
+    "        )\n",
+    "        # rows that are the first occurrence in a new group (subsidiary, loc, own_per)\n",
+    "        # should always have a B entity label. Manually override labels so this is true.\n",
+    "        first_in_group_df = df[\n",
+    "            (df[\"pred\"].ne(df[\"pred\"].shift())) & (df[\"pred\"] != \"other\")\n",
+    "        ]\n",
+    "        first_in_group_df.loc[:, \"iob_pred\"] = (\n",
+    "            \"B\" + first_in_group_df[\"iob_pred\"].str[1:]\n",
+    "        )\n",
+    "        df.update(first_in_group_df)\n",
+    "        # filter for just words that were labeled with non \"other\" entities\n",
+    "        entities_df = df.sort_values(by=[\"top_left_y\", \"top_left_x\"])\n",
+    "        entities_df = entities_df[entities_df[\"pred\"] != \"other\"]\n",
+    "        # words are labeled with IOB format which stands for inside, outside, beginning\n",
+    "        # merge B and I entities to form one entity group\n",
+    "        # (i.e. \"B-Subsidiary\" and \"I-Subsidiary\" become just \"subsidiary\"), assign a group ID\n",
+    "        entities_df[\"group\"] = (entities_df[\"iob_pred\"].str.startswith(\"B-\")).cumsum()\n",
+    "        grouped_df = (\n",
+    "            entities_df.groupby([\"group\", \"pred\"])[\"word\"]\n",
+    "            .apply(\" \".join)\n",
+    "            .reset_index()[[\"pred\", \"word\"]]\n",
+    "        )\n",
+    "        # assign a new row every time there's a new subsidiary\n",
+    "        grouped_df[\"row\"] = (grouped_df[\"pred\"].str.startswith(\"subsidiary\")).cumsum()\n",
+    "        output_df = grouped_df.pivot_table(\n",
+    "            index=\"row\", columns=\"pred\", values=\"word\", aggfunc=lambda x: \" \".join(x)\n",
+    "        ).reset_index()\n",
+    "        if output_df.empty:\n",
+    "            return output_df\n",
+    "        output_df.loc[:, \"id\"] = doc_dict[\"id\"]\n",
+    "        return output_df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ea9fe887-43ca-43e2-85e3-bf5371bd165f",
+   "metadata": {},
+   "source": [
+    "Next, wrap the `LayoutLMInferencePipeline` in an `mlflow` `pyfunc` model, which handles loading the pretrained model and managing inputs/outputs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "4d802e00-1ca4-40b3-b15b-561711a9db70",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ff844a110fb04ddcbe788e647651786c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/09/24 20:17:30 INFO mlflow.types.utils: Unsupported type hint: <class 'pandas.core.frame.DataFrame'>, skipping schema inference\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "004ac3503c77461f9ce7938949a660c5",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading artifacts:   0%|          | 0/17 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/09/24 20:17:52 WARNING mlflow.utils.requirements_utils: The following packages were not found in the public PyPI package index as of 2024-08-29; if these packages are not present in the public PyPI index, you must install them manually before loading your model: {'catalystcoop-mozilla-sec-eia'}\n",
+      "2024/09/24 20:17:52 WARNING mlflow.utils.requirements_utils: Found catalystcoop-mozilla-sec-eia version (0.1.dev304+g07d500a) contains a local version label (+g07d500a). MLflow logged a pip requirement for this package as 'catalystcoop-mozilla-sec-eia==0.1.dev304' without the local version label to make it installable from PyPI. To specify pip requirements containing local version labels, please use `conda_env` or `pip_requirements`.\n",
+      "2024/09/24 20:17:53 WARNING mlflow.transformers.model_io: Could not specify device parameter for this pipeline type.Falling back to loading the model with the default device.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "07a654fa7c914b338b0e9fbc36d48bdd",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "from PIL import Image\n",
+    "\n",
+    "from mozilla_sec_eia.models.sec10k.entities import (\n",
+    "    Ex21CompanyOwnership,\n",
+    "    Sec10kExtractionMetadata,\n",
+    ")\n",
+    "\n",
+    "\n",
+    "def clean_extracted_df(extracted_df):\n",
+    "    \"\"\"Perform basic cleaning on a dataframe extracted from an Ex. 21.\"\"\"\n",
+    "    if extracted_df.empty:\n",
+    "        return extracted_df\n",
+    "    if \"row\" in extracted_df.columns:\n",
+    "        extracted_df = extracted_df.drop(columns=[\"row\"])\n",
+    "    extracted_df[\"subsidiary\"] = extracted_df[\"subsidiary\"].str.strip().str.lower()\n",
+    "    # strip special chars from the start and end of the string\n",
+    "    extracted_df[\"subsidiary\"] = extracted_df[\"subsidiary\"].str.replace(\n",
+    "        r\"^[^\\w&\\s]+|[^\\w&\\s]+$\", \"\", regex=True\n",
+    "    )\n",
+    "    if \"loc\" in extracted_df.columns:\n",
+    "        extracted_df[\"loc\"] = extracted_df[\"loc\"].str.strip().str.lower()\n",
+    "        extracted_df[\"loc\"] = extracted_df[\"loc\"].str.replace(\n",
+    "            r\"[^a-zA-Z&,\\s]\", \"\", regex=True\n",
+    "        )\n",
+    "    if \"own_per\" in extracted_df.columns:\n",
+    "        # remove special chars and letters\n",
+    "        extracted_df[\"own_per\"] = extracted_df[\"own_per\"].str.replace(\n",
+    "            r\"[^\\d.]\", \"\", regex=True\n",
+    "        )\n",
+    "        # For values with multiple decimal points, drop everything from the second decimal point onward\n",
+    "        extracted_df[\"own_per\"] = extracted_df[\"own_per\"].str.replace(\n",
+    "            r\"(\\d*\\.\\d+)\\..*\", r\"\\1\", regex=True\n",
+    "        )\n",
+    "        extracted_df[\"own_per\"] = extracted_df[\"own_per\"].replace(\"\", np.nan)\n",
+    "        extracted_df[\"own_per\"] = extracted_df[\"own_per\"].astype(\n",
+    "            \"float64\", errors=\"ignore\"\n",
+    "        )\n",
+    "    # drop rows that have a null subsidiary value\n",
+    "    extracted_df = extracted_df.dropna(subset=\"subsidiary\")\n",
+    "    return extracted_df\n",
+    "\n",
+    "# If a model was trained in this notebook, use it. Otherwise, use the model URI from the op config\n",
+    "if training_run_id is not None:\n",
+    "    model_uri = f\"runs:/{training_run_id}/layoutlm_extractor\"\n",
+    "else:\n",
+    "    model_uri = context.op_config[\"uri\"]\n",
+    "\n",
+    "model_info = mlflow.models.get_model_info(model_uri)\n",
+    "\n",
+    "def _get_data(dataset):\n",
+    "    yield from dataset\n",
+    "\n",
+    "class Ex21Extractor(mlflow.pyfunc.PythonModel):\n",
+    "    \"\"\"Create an mlflow pyfunc model to perform full EX21 extraction.\"\"\"\n",
+    "    def load_context(self, context):\n",
+    "        \"\"\"Load pretrained model.\"\"\"\n",
+    "        os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"expandable_segments:True\"\n",
+    "        self.model_components = mlflow.transformers.load_model(\n",
+    "            context.artifacts[\"model_components\"], return_type=\"components\"\n",
+    "        )\n",
+    "\n",
+    "    def predict(self, context, model_input: pd.DataFrame, params=None):\n",
+    "        \"\"\"Use pretrained model and inference pipeline to perform inference.\"\"\"\n",
+    "        # Convert dataframe to pyarrow Dataset\n",
+    "        model_input[\"image\"] = model_input.apply(\n",
+    "            lambda row: Image.frombytes(\n",
+    "                row[\"mode\"], (row[\"width\"], row[\"height\"]), row[\"image\"]\n",
+    "            ),\n",
+    "            axis=1,\n",
+    "        )\n",
+    "        dataset = Dataset.from_list(model_input.drop([\"mode\", \"width\", \"height\"], axis=1).to_dict(\"records\"))\n",
+    "\n",
+    "        # TODO: figure out device argument\n",
+    "        pipe = pipeline(\n",
+    "            \"token-classification\",\n",
+    "            model=self.model_components[\"model\"],\n",
+    "            tokenizer=self.model_components[\"tokenizer\"],\n",
+    "            pipeline_class=LayoutLMInferencePipeline,\n",
+    "        )\n",
+    "\n",
+    "        logits = []\n",
+    "        predictions = []\n",
+    "        all_output_df = Ex21CompanyOwnership.example(size=0)\n",
+    "        extraction_metadata = Sec10kExtractionMetadata.example(size=0)\n",
+    "        for logit, pred, output_df in pipe(_get_data(dataset)):\n",
+    "            logits.append(logit)\n",
+    "            predictions.append(pred)\n",
+    "            if not output_df.empty:\n",
+    "                filename = get_metadata_filename(output_df[\"id\"].iloc[0])\n",
+    "                extraction_metadata.loc[filename, [\"success\"]] = True\n",
+    "            all_output_df = pd.concat([all_output_df, output_df])\n",
+    "        all_output_df.columns.name = None\n",
+    "        all_output_df = clean_extracted_df(all_output_df)\n",
+    "        all_output_df = all_output_df[[\"id\", \"subsidiary\", \"loc\", \"own_per\"]]\n",
+    "        all_output_df = all_output_df.reset_index(drop=True)\n",
+    "        return extraction_metadata, all_output_df\n",
+    "\n",
+    "# Save model to local temp dir with artifacts, then reload for evaluation\n",
+    "with TemporaryDirectory() as tmp_dir:\n",
+    "    mlflow.pyfunc.save_model(\n",
+    "        path=tmp_dir,\n",
+    "        python_model=Ex21Extractor(),\n",
+    "        artifacts={\"model_components\": model_uri},\n",
+    "    )\n",
+    "    ex21_extraction_model = mlflow.pyfunc.load_model(tmp_dir)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fee84b13-6c37-4afe-8faa-003ff149aa2d",
+   "metadata": {},
+   "source": [
+    "### Model Evaluation\n",
+    "Now the full extraction model can be evaluated using labeled validation data and logged to `mlflow`. The `mlflow` run used to evaluate and log the inference model will be created as a nested child run to the run used to train `layoutlm`. This setup allows multiple versions/configurations of inference to be associated with a single version of `layoutlm`, creating a clean organizational structure for testing the base model and inference logic separately."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "47c19b41-131f-4059-8f42-931237565a20",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "def clean_ex21_validation_set(validation_df: pd.DataFrame):\n",
+    "    \"\"\"Clean Ex. 21 validation data to match extracted format.\"\"\"\n",
+    "    validation_df = validation_df.rename(\n",
+    "        columns={\n",
+    "            \"Filename\": \"id\",\n",
+    "            \"Subsidiary\": \"subsidiary\",\n",
+    "            \"Location of Incorporation\": \"loc\",\n",
+    "            \"Ownership Percentage\": \"own_per\",\n",
+    "        }\n",
+    "    )\n",
+    "    validation_df[\"own_per\"] = validation_df[\"own_per\"].astype(str)\n",
+    "    validation_df[\"filename\"] = validation_df[\"id\"].apply(get_metadata_filename)\n",
+    "    validation_df = clean_extracted_df(validation_df)\n",
+    "    return validation_df\n",
+    "\n",
+    "# Load labeled validation set\n",
+    "validation_set = clean_ex21_validation_set(\n",
+    "    validation_helpers.load_validation_data(\"ex21_labels.csv\")\n",
+    ")\n",
+    "\n",
+    "# Get filing metadata for filings in validation set\n",
+    "cloud_interface = GCSArchive()\n",
+    "filing_metadata = cloud_interface.get_metadata()\n",
+    "ex21_validation_filing_metadata = filing_metadata[\n",
+    "    filing_metadata.index.isin(validation_set[\"filename\"].unique())\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "eddcc912-324a-42e9-9841-3a916c6ece6b",
+   "metadata": {},
+   "source": [
+    "Next, define methods for evaluating model output, then run extraction and log the results in a child run."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "f79bd14d-5156-4f34-9a50-e9c813b822cf",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/09/24 20:18:01 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.\n",
+      "Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.\n",
+      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:51: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:44: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/mlflow/types/utils.py:407: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "02516db30cd241ed97c08df920368bf8",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading artifacts:   0%|          | 0/17 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/09/24 20:19:33 WARNING mlflow.utils.requirements_utils: The following packages were not found in the public PyPI package index as of 2024-08-29; if these packages are not present in the public PyPI index, you must install them manually before loading your model: {'catalystcoop-mozilla-sec-eia'}\n",
+      "2024/09/24 20:19:33 WARNING mlflow.utils.requirements_utils: Found catalystcoop-mozilla-sec-eia version (0.1.dev304+g07d500a) contains a local version label (+g07d500a). MLflow logged a pip requirement for this package as 'catalystcoop-mozilla-sec-eia==0.1.dev304' without the local version label to make it installable from PyPI. To specify pip requirements containing local version labels, please use `conda_env` or `pip_requirements`.\n",
+      "2024/09/24 20:19:41 INFO mlflow.tracking._tracking_service.client: 🏃 View run fortunate-finch-744 at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/13/runs/b959cfa0ba3c4b91a0f8fe158cd0109f.\n",
+      "2024/09/24 20:19:41 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/13.\n",
+      "2024/09/24 20:19:41 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...\n",
+      "2024/09/24 20:19:42 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!\n"
+     ]
+    }
+   ],
+   "source": [
+    "from mlflow.models import infer_signature\n",
+    "\n",
+    "from mozilla_sec_eia.models.sec10k.ex_21.inference import create_inference_dataset\n",
+    "\n",
+    "\n",
+    "def ex21_validation_metrics(computed_df: pd.DataFrame, validation_df: pd.DataFrame):\n",
+    "    \"\"\"Compute validation metrics for Ex. 21 extraction.\"\"\"\n",
+    "    shared_cols = validation_df.columns.intersection(computed_df.columns)\n",
+    "    validation_df = validation_df.astype(computed_df[shared_cols].dtypes)\n",
+    "    n_equal = 0\n",
+    "    validation_filenames = validation_df[\"id\"].unique()\n",
+    "    n_files = len(validation_filenames)\n",
+    "    table_metrics_dict = {}\n",
+    "    jaccard_dict = {}\n",
+    "    incorrect_files = []\n",
+    "    # iterate through each file and check each extracted table\n",
+    "    for filename in validation_filenames:\n",
+    "        extracted_table_df = computed_df[computed_df[\"id\"] == filename].reset_index(\n",
+    "            drop=True\n",
+    "        )\n",
+    "        validation_table_df = validation_df[\n",
+    "            validation_df[\"id\"] == filename\n",
+    "        ].reset_index(drop=True)\n",
+    "        # check if the tables are exactly equal\n",
+    "        if extracted_table_df.equals(validation_table_df):\n",
+    "            # TODO: strip llc and other company strings before comparison\n",
+    "            n_equal += 1\n",
+    "        else:\n",
+    "            incorrect_files.append(filename)\n",
+    "        # compute precision and recall for each column\n",
+    "        table_metrics_dict[filename] = {}\n",
+    "        jaccard_dict[filename] = {}\n",
+    "        for col in [\"subsidiary\", \"loc\", \"own_per\"]:\n",
+    "            table_prec_recall = validation_helpers.pandas_compute_precision_recall(\n",
+    "                extracted_table_df, validation_table_df, value_col=col\n",
+    "            )\n",
+    "            table_metrics_dict[filename][f\"{col}_precision\"] = table_prec_recall[\n",
+    "                \"precision\"\n",
+    "            ]\n",
+    "            table_metrics_dict[filename][f\"{col}_recall\"] = table_prec_recall[\"recall\"]\n",
+    "            # get the jaccard similarity between columns\n",
+    "            jaccard_dict[filename][col] = validation_helpers.jaccard_similarity(\n",
+    "                computed_df=extracted_table_df,\n",
+    "                validation_df=validation_table_df,\n",
+    "                value_col=col,\n",
+    "            )\n",
+    "\n",
+    "    jaccard_df = pd.DataFrame.from_dict(jaccard_dict, orient=\"index\").reset_index()\n",
+    "    prec_recall_df = pd.DataFrame.from_dict(\n",
+    "        table_metrics_dict, orient=\"index\"\n",
+    "    ).reset_index()\n",
+    "\n",
+    "    return (\n",
+    "        jaccard_df,\n",
+    "        prec_recall_df,\n",
+    "        pd.DataFrame({\"filename\": incorrect_files}),\n",
+    "        {\n",
+    "            \"table_accuracy\": n_equal / n_files,\n",
+    "            \"avg_subsidiary_jaccard_sim\": jaccard_df[\"subsidiary\"].sum() / n_files,\n",
+    "            \"avg_location_jaccard_sim\": jaccard_df[\"loc\"].sum() / n_files,\n",
+    "            \"avg_own_per_jaccard_sim\": jaccard_df[\"own_per\"].sum() / n_files,\n",
+    "            \"avg_subsidiary_precision\": prec_recall_df[\"subsidiary_precision\"].sum()\n",
+    "            / n_files,\n",
+    "            \"avg_location_precision\": prec_recall_df[\"loc_precision\"].sum() / n_files,\n",
+    "            \"avg_own_per_precision\": prec_recall_df[\"own_per_precision\"].sum()\n",
+    "            / n_files,\n",
+    "            \"avg_subsidiary_recall\": prec_recall_df[\"subsidiary_recall\"].sum()\n",
+    "            / n_files,\n",
+    "            \"avg_location_recall\": prec_recall_df[\"loc_recall\"].sum() / n_files,\n",
+    "            \"avg_own_per_recall\": prec_recall_df[\"own_per_recall\"].sum() / n_files,\n",
+    "        },\n",
+    "    )\n",
+    "\n",
+    "\n",
+    "with mlflow.start_run(parent_run_id=model_info.run_id, nested=True):\n",
+    "    failed_metadata, dataset = create_inference_dataset(\n",
+    "        filing_metadata=ex21_validation_filing_metadata,\n",
+    "        cloud_interface=cloud_interface,\n",
+    "        has_labels=False,\n",
+    "    )\n",
+    "    metadata, extracted = ex21_extraction_model.predict(dataset.copy())\n",
+    "    metadata = pd.concat([failed_metadata, metadata])\n",
+    "\n",
+    "    jaccard_df, prec_recall_df, incorrect_filenames, metrics = ex21_validation_metrics(extracted, validation_set)\n",
+    "    mlflow.log_metrics(metrics)\n",
+    "    mlflow.pyfunc.log_model(\n",
+    "        \"exhibit21_extractor\",\n",
+    "        python_model=Ex21Extractor(),\n",
+    "        artifacts={\"model_components\": model_uri},\n",
+    "        signature=infer_signature(dataset, extracted), # NOTE: model returns a second dataframe with metadata, but mlflow only supports one in signature\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dfb56470-8527-424c-a9e5-4135e55fde4d",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/train_exhibit21_extraction.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/train_exhibit21_extraction.ipynb
deleted file mode 100644
index 5c33d22..0000000
--- a/src/mozilla_sec_eia/models/sec10k/notebooks/train_exhibit21_extraction.ipynb
+++ /dev/null
@@ -1,1045 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "0da8c588-2d09-464b-945f-168704c0cdac",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "# Exhibit 21 extraction\n",
-    "\n",
-    "This notebook implements a model built on top of [layoutlmv3](https://huggingface.co/microsoft/layoutlmv3-base/tree/main)\n",
-    "from Exhibit 21 attachments to SEC-10k filings. These documents contain a list of all subsidiary companies owned by a filing\n",
-    "company."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "48f185de-95ef-4194-9245-93f8d603d2e6",
-   "metadata": {
-    "tags": [
-     "parameters"
-    ]
-   },
-   "outputs": [],
-   "source": [
-    "import dagstermill\n",
-    "\n",
-    "context = dagstermill.get_context(op_config={\n",
-    "    \"train_model\": True,\n",
-    "    \"model_version\": \"latest\",\n",
-    "})"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "7f299b2b-2358-4526-b023-f29c817316d9",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "## Train Layoutlmv3"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "32edcce1-ab18-40b6-9da8-ce0ea53c2f72",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "### Setup training/test sets"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8b389646-c4af-4c92-a29e-b4b23f4c391b",
-   "metadata": {},
-   "source": [
-    "Download training data and convert to NER annotations. This involves converting exhibit 21 filings into PDF's, then using labels generated by label studio to produce the annotations. These annotations are then used to create a huggingface dataset that will be used for training.\n",
-    "\n",
-    "First define several helper functions to do the conversion."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "9372b908-d9b9-4d18-a5bf-d332648b3e49",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "import json\n",
-    "import os\n",
-    "from pathlib import Path\n",
-    "from tempfile import TemporaryDirectory\n",
-    "\n",
-    "import numpy as np\n",
-    "import pandas as pd\n",
-    "\n",
-    "from mozilla_sec_eia.library import validation_helpers\n",
-    "from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive, get_metadata_filename\n",
-    "from mozilla_sec_eia.models.sec10k.utils.layoutlm import normalize_bboxes\n",
-    "from mozilla_sec_eia.models.sec10k.utils.pdf import (\n",
-    "    get_pdf_data_from_path,\n",
-    "    render_page,\n",
-    ")\n",
-    "\n",
-    "# Set some constants\n",
-    "LABELS = [\n",
-    "    \"O\",\n",
-    "    \"B-Subsidiary\",\n",
-    "    \"I-Subsidiary\",\n",
-    "    \"B-Loc\",\n",
-    "    \"I-Loc\",\n",
-    "    \"B-Own_Per\",\n",
-    "    \"I-Own_Per\",\n",
-    "]\n",
-    "LABEL_PRIORITY = [\n",
-    "    \"I-Subsidiary\",\n",
-    "    \"I-Loc\",\n",
-    "    \"I-Own_Per\",\n",
-    "    \"B-Subsidiary\",\n",
-    "    \"B-Loc\",\n",
-    "    \"B-Own_Per\",\n",
-    "    \"O\",\n",
-    "]\n",
-    "\n",
-    "BBOX_COLS = [\"top_left_x\", \"top_left_y\", \"bottom_right_x\", \"bottom_right_y\"]\n",
-    "BBOX_COLS_PDF = [\n",
-    "    \"top_left_x_pdf\",\n",
-    "    \"top_left_y_pdf\",\n",
-    "    \"bottom_right_x_pdf\",\n",
-    "    \"bottom_right_y_pdf\",\n",
-    "]\n",
-    "\n",
-    "# Map back and forth between id's and labels\n",
-    "id2label = dict(enumerate(LABELS))\n",
-    "label2id = {v: k for k, v in enumerate(LABELS)}\n",
-    "\n",
-    "def _is_cik_in_training_data(labeled_json_filename, tracking_df):\n",
-    "    # TODO: for now CIK is stored as an int, update when fixed\n",
-    "    cik = int(labeled_json_filename.split(\"/\")[-1].split(\"-\")[0])\n",
-    "    return cik in tracking_df.CIK.unique()\n",
-    "\n",
-    "\n",
-    "def format_label_studio_output(\n",
-    "    labeled_json_dir: Path,\n",
-    "    pdfs_dir: Path,\n",
-    ") -> pd.DataFrame:\n",
-    "    \"\"\"Format Label Studio output JSONs into dataframe.\"\"\"\n",
-    "    labeled_df = pd.DataFrame()\n",
-    "    # TODO: make this path stuff less janky?\n",
-    "    tracking_df = validation_helpers.load_training_data(\"ex21_labels.csv\")\n",
-    "    for json_filename in os.listdir(labeled_json_dir):\n",
-    "        if not json_filename[0].isdigit() or json_filename.endswith(\".json\"):\n",
-    "            continue\n",
-    "        json_file_path = labeled_json_dir / json_filename\n",
-    "        with Path.open(json_file_path) as j:\n",
-    "            doc_dict = json.loads(j.read())\n",
-    "\n",
-    "        filename = doc_dict[\"task\"][\"data\"][\"ocr\"].split(\"/\")[-1].split(\".\")[0]\n",
-    "        # check if old local naming schema is being used\n",
-    "        if len(filename.split(\"-\")) == 6:\n",
-    "            filename = \"-\".join(filename.split(\"-\")[2:])\n",
-    "        if not _is_cik_in_training_data(filename, tracking_df=tracking_df):\n",
-    "            continue\n",
-    "\n",
-    "        pdf_filename = filename + \".pdf\"\n",
-    "        src_path = pdfs_dir / pdf_filename\n",
-    "        extracted, pg = get_pdf_data_from_path(src_path)\n",
-    "        txt = extracted[\"pdf_text\"]\n",
-    "        pg_meta = extracted[\"page\"]\n",
-    "        # normalize bboxes between 0 and 1000 for Hugging Face\n",
-    "        txt = normalize_bboxes(txt_df=txt, pg_meta_df=pg_meta)\n",
-    "        # parse the output dictionary of labeled bounding boxes from Label Studio\n",
-    "        doc_df = pd.DataFrame()\n",
-    "        for item in doc_dict[\"result\"]:\n",
-    "            value = item[\"value\"]\n",
-    "            # sometimes Label Studio will fill in an empty list as a label\n",
-    "            # when there is really no label\n",
-    "            # TODO: do this without dict comprehension?\n",
-    "            if (\"labels\" in value) and value[\"labels\"] == []:\n",
-    "                value = {k: v for k, v in value.items() if k != \"labels\"}\n",
-    "            ind = int(item[\"id\"].split(\"_\")[-1])\n",
-    "            doc_df = pd.concat([doc_df, pd.DataFrame(value, index=[ind])])\n",
-    "\n",
-    "        # combine the bounding boxes for each word\n",
-    "        doc_df = doc_df.groupby(level=0).first()\n",
-    "        txt.loc[:, \"id\"] = filename\n",
-    "        # TODO: probably want to filter out these empty Ex. 21 docs\n",
-    "        # the doc might not have any labels in it if it was an empty Ex. 21\n",
-    "        if \"labels\" not in doc_df:\n",
-    "            doc_df.loc[:, \"labels\"] = pd.Series()\n",
-    "\n",
-    "        output_df = pd.concat([txt, doc_df[[\"labels\"]]], axis=1)\n",
-    "        labeled_df = pd.concat([labeled_df, output_df])\n",
-    "\n",
-    "    # fill in unlabeled words and clean up labeled dataframe\n",
-    "    labeled_df[\"labels\"] = labeled_df[\"labels\"].fillna(\"O\")\n",
-    "    labeled_df = labeled_df.rename(columns={\"labels\": \"ner_tag\"})\n",
-    "    non_id_columns = [col for col in labeled_df.columns if col != \"id\"]\n",
-    "    labeled_df = labeled_df.loc[:, [\"id\"] + non_id_columns]\n",
-    "\n",
-    "    # TODO: add in sanity checks on labeled_df bounding boxes to make sure\n",
-    "    # that no value is above 1000 or below 0\n",
-    "\n",
-    "    return labeled_df\n",
-    "\n",
-    "\n",
-    "def get_image_dict(pdfs_dir):\n",
-    "    \"\"\"Create a dictionary with filenames and their Ex. 21 images.\"\"\"\n",
-    "    image_dict = {}\n",
-    "    for pdf_filename in os.listdir(pdfs_dir):\n",
-    "        if pdf_filename.split(\".\")[-1] != \"pdf\":\n",
-    "            continue\n",
-    "        pdf_file_path = pdfs_dir / pdf_filename\n",
-    "        _, pg = get_pdf_data_from_path(pdf_file_path)\n",
-    "        full_pg_img = render_page(pg)\n",
-    "        filename = pdf_filename.split(\".\")[0]\n",
-    "        image_dict[filename] = full_pg_img\n",
-    "    return image_dict\n",
-    "\n",
-    "\n",
-    "def format_as_ner_annotations(\n",
-    "    labeled_json_path: Path,\n",
-    "    pdfs_path: Path,\n",
-    "    gcs_folder_name: Path,\n",
-    ") -> list[dict]:\n",
-    "    \"\"\"Format a Label Studio output JSONs as NER annotations.\n",
-    "\n",
-    "    Formats the dataframe as named entity recognition annotations.\n",
-    "    # TODO: say more about this format\n",
-    "\n",
-    "    Returns:\n",
-    "        ner_annotations: a list of dicts, with one dict for each doc.\n",
-    "    \"\"\"\n",
-    "    GCSArchive().cache_training_data(\n",
-    "        json_cache_path=labeled_json_path,\n",
-    "        pdf_cache_path=pdfs_path,\n",
-    "        gcs_folder_name=gcs_folder_name\n",
-    "    )\n",
-    "\n",
-    "    labeled_df = format_label_studio_output(\n",
-    "        labeled_json_dir=labeled_json_path, pdfs_dir=pdfs_path\n",
-    "    )\n",
-    "    # convert dataframe/dictionary into NER format\n",
-    "    # document_annotation_to_ner https://github.com/butlerlabs/docai/blob/main/docai/annotations/ner_utils.py\n",
-    "    # complete dataset is a list of dicts, with one dict for each doc\n",
-    "    doc_filenames = labeled_df[\"id\"].unique()\n",
-    "    image_dict = get_image_dict(pdfs_dir=pdfs_path)\n",
-    "    ner_annotations = []\n",
-    "    for filename in doc_filenames:\n",
-    "        annotation = {\n",
-    "            \"id\": filename,\n",
-    "            \"tokens\": labeled_df.groupby(\"id\")[\"text\"].apply(list).loc[filename],\n",
-    "            \"ner_tags\": labeled_df.groupby(\"id\")[\"ner_tag\"].apply(list).loc[filename],\n",
-    "            \"bboxes\": labeled_df.loc[labeled_df[\"id\"] == filename, :][BBOX_COLS_PDF]\n",
-    "            .to_numpy()\n",
-    "            .tolist(),\n",
-    "            \"image\": image_dict[filename],\n",
-    "        }\n",
-    "        ner_annotations.append(annotation)\n",
-    "\n",
-    "    return ner_annotations\n",
-    "\n",
-    "def _prepare_dataset(annotations, processor, label2id):\n",
-    "    \"\"\"Put the dataset in its final format for training LayoutLM.\"\"\"\n",
-    "\n",
-    "    def _convert_ner_tags_to_id(ner_tags, label2id):\n",
-    "        return [int(label2id[ner_tag]) for ner_tag in ner_tags]\n",
-    "\n",
-    "    images = annotations[\"image\"]\n",
-    "    words = annotations[\"tokens\"]\n",
-    "    boxes = annotations[\"bboxes\"]\n",
-    "    # Map over labels and convert to numeric id for each ner_tag\n",
-    "    ner_tags = [\n",
-    "        _convert_ner_tags_to_id(ner_tags, label2id)\n",
-    "        for ner_tags in annotations[\"ner_tags\"]\n",
-    "    ]\n",
-    "\n",
-    "    encoding = processor(\n",
-    "        images,\n",
-    "        words,\n",
-    "        boxes=boxes,\n",
-    "        word_labels=ner_tags,\n",
-    "        truncation=True,\n",
-    "        padding=\"max_length\",\n",
-    "    )\n",
-    "\n",
-    "    return encoding\n",
-    "\n",
-    "def compute_metrics(p, metric, label_list, return_entity_level_metrics=False):\n",
-    "    \"\"\"Compute metrics to train and evaluate the model on.\"\"\"\n",
-    "    predictions, labels = p\n",
-    "    predictions = np.argmax(predictions, axis=2)\n",
-    "\n",
-    "    # Remove ignored index (special tokens)\n",
-    "    true_predictions = [\n",
-    "        [label_list[pred] for (pred, lab) in zip(prediction, label) if lab != -100]\n",
-    "        for prediction, label in zip(predictions, labels)\n",
-    "    ]\n",
-    "    true_labels = [\n",
-    "        [label_list[lab] for (pred, lab) in zip(prediction, label) if lab != -100]\n",
-    "        for prediction, label in zip(predictions, labels)\n",
-    "    ]\n",
-    "\n",
-    "    results = metric.compute(predictions=true_predictions, references=true_labels)\n",
-    "    if return_entity_level_metrics:\n",
-    "        # Unpack nested dictionaries\n",
-    "        final_results = {}\n",
-    "        for key, value in results.items():\n",
-    "            if isinstance(value, dict):\n",
-    "                for n, v in value.items():\n",
-    "                    final_results[f\"{key}_{n}\"] = v\n",
-    "            else:\n",
-    "                final_results[key] = value\n",
-    "        return final_results\n",
-    "    return {\n",
-    "        \"precision\": results[\"overall_precision\"],\n",
-    "        \"recall\": results[\"overall_recall\"],\n",
-    "        \"f1\": results[\"overall_f1\"],\n",
-    "        \"accuracy\": results[\"overall_accuracy\"],\n",
-    "    }"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8160263c-8f69-437c-918b-e56ad007961a",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "#### Finetune Model\n",
-    "The next cell will use the functions defined in the previous section to actually construct a huggingface dataset from labeled data and finetune the `layoutlm` model. Model finetuning will only be run if configured to do so, otherwise a pretrained version will be used from the `mlflow` tracking server.\n",
-    "\n",
-    "Model training contains several steps implemented below:\n",
-    "1. Use temporary path to convert filings to PDF's and stash labels\n",
-    "2. Use PDF's and labels to convert PDF's and labels to NER annotations\n",
-    "3. Construct huggingface dataset from NER annotations and split into train and test sets\n",
-    "4. Load pretrained model from huggingface\n",
-    "5. Finetune model on training data and evaluate on test data"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "71d205b2-e6ea-4ad0-982c-22e762269119",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "<table> is empty\n",
-      "'<c> The Southwest Companies Nevada PriMerit Bank Federally chartered stock savings bank Paiute Pipeline Company Nevada Carson Water Company Nevada Southwest Gas Transmission Company Partnership between Southwest Gas Corporation and Utility Financial Corp. Utility Financial Corp. Nevada Southwest Gas Corporation of Arizona Nevada PRIMERIT BANK SUBSIDIARIES AT DECEMBER 31, 1993'\n",
-      "<table> is empty\n",
-      "'<c> TCA Management Company.................................................... Texas Teleservice Corporation of America........................................ Texas Texas Community Antennas, Inc............................................. Texas Texas Telecable, Inc...................................................... Texas TCA Cable of Amarillo, Inc................................................ Texas Telecable Associates, Inc................................................. Texas Delta Cablevision, Inc.................................................... Arkansas Sun Valley Cablevision, Inc............................................... Idaho VPI Communications, Inc................................................... Texas AvComm Corporation........................................................ Texas Tele-Communications of Arkansas L. P......................................'\n",
-      "<table> is empty\n",
-      "'<c> DOMESTIC SUBSIDIARIES International Sales &amp; Business, Inc. California KLA-Tencor Building Corporation California KLA-Tencor Disc Corporation California KLA-Tencor International Corporation California KLA-Tencor Klinnik Corporation California KLA-Tencor Management Corporation California KLA-Tencor (Thailand Branch) Corporation California VLSI Standards, Inc. California Amray, Inc. Delaware Groff Associates, Inc. California DeviceWare, Inc. California INTERNATIONAL SUBSIDIARIES'\n",
-      "<table> is empty\n",
-      "'<c> 1. Northeast Energy, LLC (100%-Owned) .................................................... Florida 2. Northeast Energy Associates, A Limited Partnership (99%-Owned) (a) .................... Massachusetts 3. North Jersey Energy Associates, A Limited Partnership (99%-Owned) (a) ................. New Jersey (a) Northeast Energy, LLC owns the remaining 1% interest. </c>'\n",
-      "<table> is empty\n",
-      "'<c> 1. ESI Tractebel Urban Renewal Corporation (100%-Owned) .................................. New Jersey </c>'\n",
-      "<table> is empty\n",
-      "'<c> IVANHOE ENERGY HOLDINGS INC. (Nevada) 100% IVANHOE ENERGY (USA) INC. (Nevada) 100% (indirect) IVANHOE ENERGY ROYALTY INC. (Nevada) 100% (indirect) IVANHOE ENERGY INTERNATIONAL VENTURES INC. (BVI) 100% Ivanhoe Energy Sweetwater Limited (Malta) 100% (Indirect) Ivanhoe Energy (Qatar) Inc. (BVI) 100% (Indirect) GTL Japan Corporation (Japan) 100% (Indirect) IVANHOE ENERGY'\n",
-      "<table> is empty\n",
-      "'<c> Airgas Canada, Inc. Canada Airgas Carbonic, Inc. DE Airgas Data, LLC DE Airgas East, Inc. DE Airgas Great Lakes, Inc. DE Airgas Gulf States, Inc. DE Airgas Intermountain, Inc. CO Airgas International, Inc. VI Airgas Mid America, Inc. DE Airgas Mid South, Inc. DE Airgas Nor Pac, Inc. DE'\n",
-      "<table> is empty\n",
-      "'<c> Subsidiary Name State of Formation - --------------- ------------------- American Ecology Environmental Services Corporation Texas Corporation American Ecology Holdings Corporation Delaware Corporation American Ecology Recycle Center, Inc. Delaware Corporation American Ecology Services Corporation Delaware Corporation Texas Ecologists, Inc. Texas Corporation US Ecology, Inc. California Corporation US Ecology Idaho, Inc. Delaware'\n",
-      "Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
-      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "bae617cb831d4b2593c0fa4a874f1592",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Map:   0%|          | 0/159 [00:00<?, ? examples/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/training_args.py:1525: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
-      "  warnings.warn(\n",
-      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
-      "To disable this warning, you can either:\n",
-      "\t- Avoid using `tokenizers` before the fork if possible\n",
-      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
-      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
-      "To disable this warning, you can either:\n",
-      "\t- Avoid using `tokenizers` before the fork if possible\n",
-      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
-      "max_steps is given, it will override any value given in num_train_epochs\n",
-      "2024/09/23 14:14:48 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.\n",
-      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "\n",
-       "    <div>\n",
-       "      \n",
-       "      <progress value='2' max='1000' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
-       "      [   2/1000 : < :, Epoch 0.01/8]\n",
-       "    </div>\n",
-       "    <table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       " <tr style=\"text-align: left;\">\n",
-       "      <th>Step</th>\n",
-       "      <th>Training Loss</th>\n",
-       "      <th>Validation Loss</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "  </tbody>\n",
-       "</table><p>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2024/09/23 14:14:50 INFO mlflow.tracking._tracking_service.client: 🏃 View run bedecked-trout-555 at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/13/runs/573e64992704411c9013937d849e1504.\n",
-      "2024/09/23 14:14:50 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/13.\n",
-      "2024/09/23 14:14:51 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...\n",
-      "2024/09/23 14:14:51 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!\n"
-     ]
-    },
-    {
-     "ename": "OutOfMemoryError",
-     "evalue": "CUDA out of memory. Tried to allocate 24.00 MiB. GPU 0 has a total capacity of 3.93 GiB of which 20.12 MiB is free. Including non-PyTorch memory, this process has 2.72 GiB memory in use. Of the allocated memory 2.53 GiB is allocated by PyTorch, and 104.91 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mOutOfMemoryError\u001b[0m                          Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[6], line 94\u001b[0m\n\u001b[1;32m     91\u001b[0m mlflow\u001b[38;5;241m.\u001b[39mset_experiment(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mexhibit21_extraction_test\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m     92\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m mlflow\u001b[38;5;241m.\u001b[39mstart_run():\n\u001b[1;32m     93\u001b[0m     \u001b[38;5;66;03m# Train inside mlflow run. Mlflow will automatically handle logging training metrcis\u001b[39;00m\n\u001b[0;32m---> 94\u001b[0m     \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     96\u001b[0m     \u001b[38;5;66;03m# Log finetuend model with mlflow\u001b[39;00m\n\u001b[1;32m     97\u001b[0m     mlflow\u001b[38;5;241m.\u001b[39mtransformers\u001b[38;5;241m.\u001b[39mlog_model(\n\u001b[1;32m     98\u001b[0m         trainer, artifact_path\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlayoutlm_extractor\u001b[39m\u001b[38;5;124m\"\u001b[39m, task\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtoken-classification\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m     99\u001b[0m     )\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/trainer.py:1938\u001b[0m, in \u001b[0;36mTrainer.train\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m   1936\u001b[0m         hf_hub_utils\u001b[38;5;241m.\u001b[39menable_progress_bars()\n\u001b[1;32m   1937\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1938\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   1939\u001b[0m \u001b[43m        \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1940\u001b[0m \u001b[43m        \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1941\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1942\u001b[0m \u001b[43m        \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1943\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/trainer.py:2279\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m   2276\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallback_handler\u001b[38;5;241m.\u001b[39mon_step_begin(args, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol)\n\u001b[1;32m   2278\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator\u001b[38;5;241m.\u001b[39maccumulate(model):\n\u001b[0;32m-> 2279\u001b[0m     tr_loss_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtraining_step\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2281\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m   2282\u001b[0m     args\u001b[38;5;241m.\u001b[39mlogging_nan_inf_filter\n\u001b[1;32m   2283\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torch_xla_available()\n\u001b[1;32m   2284\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m (torch\u001b[38;5;241m.\u001b[39misnan(tr_loss_step) \u001b[38;5;129;01mor\u001b[39;00m torch\u001b[38;5;241m.\u001b[39misinf(tr_loss_step))\n\u001b[1;32m   2285\u001b[0m ):\n\u001b[1;32m   2286\u001b[0m     \u001b[38;5;66;03m# if loss is nan or inf simply add the average of previous logged losses\u001b[39;00m\n\u001b[1;32m   2287\u001b[0m     tr_loss \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m tr_loss \u001b[38;5;241m/\u001b[39m (\u001b[38;5;241m1\u001b[39m \u001b[38;5;241m+\u001b[39m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mglobal_step \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_globalstep_last_logged)\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/trainer.py:3318\u001b[0m, in \u001b[0;36mTrainer.training_step\u001b[0;34m(self, model, inputs)\u001b[0m\n\u001b[1;32m   3315\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m loss_mb\u001b[38;5;241m.\u001b[39mreduce_mean()\u001b[38;5;241m.\u001b[39mdetach()\u001b[38;5;241m.\u001b[39mto(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mdevice)\n\u001b[1;32m   3317\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompute_loss_context_manager():\n\u001b[0;32m-> 3318\u001b[0m     loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute_loss\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   3320\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m inputs\n\u001b[1;32m   3321\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m   3322\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mtorch_empty_cache_steps \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m   3323\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mglobal_step \u001b[38;5;241m%\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mtorch_empty_cache_steps \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m   3324\u001b[0m ):\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/trainer.py:3363\u001b[0m, in \u001b[0;36mTrainer.compute_loss\u001b[0;34m(self, model, inputs, return_outputs)\u001b[0m\n\u001b[1;32m   3361\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m   3362\u001b[0m     labels \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 3363\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   3364\u001b[0m \u001b[38;5;66;03m# Save past state if it exists\u001b[39;00m\n\u001b[1;32m   3365\u001b[0m \u001b[38;5;66;03m# TODO: this needs to be fixed and made cleaner later.\u001b[39;00m\n\u001b[1;32m   3366\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mpast_index \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1551\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1553\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1560\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1561\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1562\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m   1565\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.py:1099\u001b[0m, in \u001b[0;36mLayoutLMv3ForTokenClassification.forward\u001b[0;34m(self, input_ids, bbox, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict, pixel_values)\u001b[0m\n\u001b[1;32m   1069\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m   1070\u001b[0m \u001b[38;5;124;03mlabels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):\u001b[39;00m\n\u001b[1;32m   1071\u001b[0m \u001b[38;5;124;03m    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m   1095\u001b[0m \u001b[38;5;124;03m>>> logits = outputs.logits\u001b[39;00m\n\u001b[1;32m   1096\u001b[0m \u001b[38;5;124;03m```\"\"\"\u001b[39;00m\n\u001b[1;32m   1097\u001b[0m return_dict \u001b[38;5;241m=\u001b[39m return_dict \u001b[38;5;28;01mif\u001b[39;00m return_dict \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39muse_return_dict\n\u001b[0;32m-> 1099\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlayoutlmv3\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   1100\u001b[0m \u001b[43m    \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1101\u001b[0m \u001b[43m    \u001b[49m\u001b[43mbbox\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbbox\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1102\u001b[0m \u001b[43m    
\u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1103\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtoken_type_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken_type_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1104\u001b[0m \u001b[43m    \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1105\u001b[0m \u001b[43m    \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1106\u001b[0m \u001b[43m    \u001b[49m\u001b[43minputs_embeds\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs_embeds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1107\u001b[0m \u001b[43m    \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1108\u001b[0m \u001b[43m    \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1109\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1110\u001b[0m \u001b[43m    \u001b[49m\u001b[43mpixel_values\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpixel_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1111\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1112\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m input_ids \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m   1113\u001b[0m     input_shape \u001b[38;5;241m=\u001b[39m input_ids\u001b[38;5;241m.\u001b[39msize()\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1551\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1553\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1560\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1561\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1562\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m   1565\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.py:975\u001b[0m, in \u001b[0;36mLayoutLMv3Model.forward\u001b[0;34m(self, input_ids, bbox, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, pixel_values, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m    968\u001b[0m \u001b[38;5;66;03m# Prepare head mask if needed\u001b[39;00m\n\u001b[1;32m    969\u001b[0m \u001b[38;5;66;03m# 1.0 in head_mask indicate we keep the head\u001b[39;00m\n\u001b[1;32m    970\u001b[0m \u001b[38;5;66;03m# attention_probs has shape bsz x n_heads x N x N\u001b[39;00m\n\u001b[1;32m    971\u001b[0m \u001b[38;5;66;03m# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]\u001b[39;00m\n\u001b[1;32m    972\u001b[0m \u001b[38;5;66;03m# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]\u001b[39;00m\n\u001b[1;32m    973\u001b[0m head_mask \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_head_mask(head_mask, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mnum_hidden_layers)\n\u001b[0;32m--> 975\u001b[0m encoder_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoder\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    976\u001b[0m \u001b[43m    \u001b[49m\u001b[43membedding_output\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    977\u001b[0m \u001b[43m    \u001b[49m\u001b[43mbbox\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfinal_bbox\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    978\u001b[0m \u001b[43m    \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfinal_position_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    979\u001b[0m \u001b[43m    
\u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextended_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    980\u001b[0m \u001b[43m    \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    981\u001b[0m \u001b[43m    \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    982\u001b[0m \u001b[43m    \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    983\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    984\u001b[0m \u001b[43m    \u001b[49m\u001b[43mpatch_height\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpatch_height\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    985\u001b[0m \u001b[43m    \u001b[49m\u001b[43mpatch_width\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpatch_width\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    986\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    988\u001b[0m sequence_output \u001b[38;5;241m=\u001b[39m encoder_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m    990\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m return_dict:\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1551\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1553\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1560\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1561\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1562\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m   1565\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.py:681\u001b[0m, in \u001b[0;36mLayoutLMv3Encoder.forward\u001b[0;34m(self, hidden_states, bbox, attention_mask, head_mask, output_attentions, output_hidden_states, return_dict, position_ids, patch_height, patch_width)\u001b[0m\n\u001b[1;32m    671\u001b[0m     layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_gradient_checkpointing_func(\n\u001b[1;32m    672\u001b[0m         layer_module\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__call__\u001b[39m,\n\u001b[1;32m    673\u001b[0m         hidden_states,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    678\u001b[0m         rel_2d_pos,\n\u001b[1;32m    679\u001b[0m     )\n\u001b[1;32m    680\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 681\u001b[0m     layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[43mlayer_module\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    682\u001b[0m \u001b[43m        \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    683\u001b[0m \u001b[43m        \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    684\u001b[0m \u001b[43m        \u001b[49m\u001b[43mlayer_head_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    685\u001b[0m \u001b[43m        \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    686\u001b[0m \u001b[43m        \u001b[49m\u001b[43mrel_pos\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrel_pos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    687\u001b[0m \u001b[43m        \u001b[49m\u001b[43mrel_2d_pos\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrel_2d_pos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    688\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    690\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m 
layer_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m    691\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m output_attentions:\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1551\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1553\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1560\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1561\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1562\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m   1565\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.py:532\u001b[0m, in \u001b[0;36mLayoutLMv3Layer.forward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, output_attentions, rel_pos, rel_2d_pos)\u001b[0m\n\u001b[1;32m    523\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\n\u001b[1;32m    524\u001b[0m     \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m    525\u001b[0m     hidden_states,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    530\u001b[0m     rel_2d_pos\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m    531\u001b[0m ):\n\u001b[0;32m--> 532\u001b[0m     self_attention_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mattention\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    533\u001b[0m \u001b[43m        \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    534\u001b[0m \u001b[43m        \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    535\u001b[0m \u001b[43m        \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    536\u001b[0m \u001b[43m        \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    537\u001b[0m \u001b[43m        \u001b[49m\u001b[43mrel_pos\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrel_pos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    538\u001b[0m \u001b[43m        \u001b[49m\u001b[43mrel_2d_pos\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrel_2d_pos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    539\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    540\u001b[0m     attention_output \u001b[38;5;241m=\u001b[39m self_attention_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m    542\u001b[0m     
outputs \u001b[38;5;241m=\u001b[39m self_attention_outputs[\u001b[38;5;241m1\u001b[39m:]  \u001b[38;5;66;03m# add self attentions if we output attention weights\u001b[39;00m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1551\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1553\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1560\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1561\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1562\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m   1565\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.py:500\u001b[0m, in \u001b[0;36mLayoutLMv3Attention.forward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, output_attentions, rel_pos, rel_2d_pos)\u001b[0m\n\u001b[1;32m    491\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\n\u001b[1;32m    492\u001b[0m     \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m    493\u001b[0m     hidden_states,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    498\u001b[0m     rel_2d_pos\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m    499\u001b[0m ):\n\u001b[0;32m--> 500\u001b[0m     self_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mself\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    501\u001b[0m \u001b[43m        \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    502\u001b[0m \u001b[43m        \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    503\u001b[0m \u001b[43m        \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    504\u001b[0m \u001b[43m        \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    505\u001b[0m \u001b[43m        \u001b[49m\u001b[43mrel_pos\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrel_pos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    506\u001b[0m \u001b[43m        \u001b[49m\u001b[43mrel_2d_pos\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrel_2d_pos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    507\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    508\u001b[0m     attention_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput(self_outputs[\u001b[38;5;241m0\u001b[39m], hidden_states)\n\u001b[1;32m    509\u001b[0m     outputs 
\u001b[38;5;241m=\u001b[39m (attention_output,) \u001b[38;5;241m+\u001b[39m self_outputs[\u001b[38;5;241m1\u001b[39m:]  \u001b[38;5;66;03m# add attentions if we output them\u001b[39;00m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1551\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1553\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1560\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1561\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1562\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m   1565\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.py:448\u001b[0m, in \u001b[0;36mLayoutLMv3SelfAttention.forward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, output_attentions, rel_pos, rel_2d_pos)\u001b[0m\n\u001b[1;32m    444\u001b[0m     attention_scores \u001b[38;5;241m=\u001b[39m attention_scores \u001b[38;5;241m+\u001b[39m attention_mask\n\u001b[1;32m    446\u001b[0m \u001b[38;5;66;03m# Normalize the attention scores to probabilities.\u001b[39;00m\n\u001b[1;32m    447\u001b[0m \u001b[38;5;66;03m# Use the trick of the CogView paper to stablize training\u001b[39;00m\n\u001b[0;32m--> 448\u001b[0m attention_probs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcogview_attention\u001b[49m\u001b[43m(\u001b[49m\u001b[43mattention_scores\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    450\u001b[0m \u001b[38;5;66;03m# This is actually dropping out entire tokens to attend to, which might\u001b[39;00m\n\u001b[1;32m    451\u001b[0m \u001b[38;5;66;03m# seem a bit unusual, but is taken from the original Transformer paper.\u001b[39;00m\n\u001b[1;32m    452\u001b[0m attention_probs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdropout(attention_probs)\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.py:414\u001b[0m, in \u001b[0;36mLayoutLMv3SelfAttention.cogview_attention\u001b[0;34m(self, attention_scores, alpha)\u001b[0m\n\u001b[1;32m    412\u001b[0m scaled_attention_scores \u001b[38;5;241m=\u001b[39m attention_scores \u001b[38;5;241m/\u001b[39m alpha\n\u001b[1;32m    413\u001b[0m max_value \u001b[38;5;241m=\u001b[39m scaled_attention_scores\u001b[38;5;241m.\u001b[39mamax(dim\u001b[38;5;241m=\u001b[39m(\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m))\u001b[38;5;241m.\u001b[39munsqueeze(\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m)\n\u001b[0;32m--> 414\u001b[0m new_attention_scores \u001b[38;5;241m=\u001b[39m \u001b[43m(\u001b[49m\u001b[43mscaled_attention_scores\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mmax_value\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43m \u001b[49m\u001b[43malpha\u001b[49m\n\u001b[1;32m    415\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m nn\u001b[38;5;241m.\u001b[39mSoftmax(dim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m)(new_attention_scores)\n",
-      "\u001b[0;31mOutOfMemoryError\u001b[0m: CUDA out of memory. Tried to allocate 24.00 MiB. GPU 0 has a total capacity of 3.93 GiB of which 20.12 MiB is free. Including non-PyTorch memory, this process has 2.72 GiB memory in use. Of the allocated memory 2.53 GiB is allocated by PyTorch, and 104.91 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)"
-     ]
-    }
-   ],
-   "source": [
-    "import mlflow\n",
-    "from datasets import (\n",
-    "    Array2D,\n",
-    "    Array3D,\n",
-    "    Dataset,\n",
-    "    Features,\n",
-    "    Sequence,\n",
-    "    Value,\n",
-    "    load_metric,\n",
-    ")\n",
-    "from dotenv import load_dotenv\n",
-    "from transformers import (\n",
-    "    AutoProcessor,\n",
-    "    LayoutLMv3ForTokenClassification,\n",
-    "    Trainer,\n",
-    "    TrainingArguments,\n",
-    ")\n",
-    "from transformers.data.data_collator import default_data_collator\n",
-    "\n",
-    "from mozilla_sec_eia.library.mlflow import configure_mlflow\n",
-    "\n",
-    "load_dotenv()\n",
-    "\n",
-    "\n",
-    "configure_mlflow()\n",
-    "\n",
-    "# Only finetune if configured to do so\n",
-    "if context.op_config[\"train_model\"]:\n",
-    "    # Change temp_dir to save training data locally for inspection\n",
-    "    with TemporaryDirectory() as temp_dir:\n",
-    "        ner_annotations = format_as_ner_annotations(\n",
-    "            labeled_json_path=Path(temp_dir) / \"sec10k_filings\" / \"labeled_jsons\",\n",
-    "            pdfs_path=Path(temp_dir) / \"sec10k_filings\" / \"pdfs\",\n",
-    "            gcs_folder_name=\"labeledv0.2/\",\n",
-    "        )\n",
-    "\n",
-    "    # Cache/prepare training data\n",
-    "    dataset = Dataset.from_list(ner_annotations)\n",
-    "\n",
-    "    # Load pretrained model\n",
-    "    model = LayoutLMv3ForTokenClassification.from_pretrained(\n",
-    "        \"microsoft/layoutlmv3-base\", id2label=id2label, label2id=label2id\n",
-    "    )\n",
-    "    processor = AutoProcessor.from_pretrained(\n",
-    "        \"microsoft/layoutlmv3-base\", apply_ocr=False\n",
-    "    )\n",
-    "\n",
-    "    # Prepare our train & eval dataset\n",
-    "    column_names = dataset.column_names\n",
-    "    features = Features(\n",
-    "        {\n",
-    "            \"pixel_values\": Array3D(dtype=\"float32\", shape=(3, 224, 224)),\n",
-    "            \"input_ids\": Sequence(feature=Value(dtype=\"int64\")),\n",
-    "            \"attention_mask\": Sequence(Value(dtype=\"int64\")),\n",
-    "            \"bbox\": Array2D(dtype=\"int64\", shape=(512, 4)),\n",
-    "            \"labels\": Sequence(feature=Value(dtype=\"int64\")),\n",
-    "        }\n",
-    "    )\n",
-    "    dataset = dataset.map(\n",
-    "        lambda annotations: _prepare_dataset(annotations, processor, label2id),\n",
-    "        batched=True,\n",
-    "        remove_columns=column_names,\n",
-    "        features=features,\n",
-    "    )\n",
-    "    dataset.set_format(\"torch\")\n",
-    "    split_dataset = dataset.train_test_split(test_size=0.2)\n",
-    "    train_dataset, eval_dataset = split_dataset[\"train\"], split_dataset[\"test\"]\n",
-    "\n",
-    "    # Initialize our Trainer\n",
-    "    metric = load_metric(\"seqeval\")\n",
-    "    training_args = TrainingArguments(\n",
-    "        max_steps=1000,\n",
-    "        per_device_train_batch_size=1,\n",
-    "        per_device_eval_batch_size=1,\n",
-    "        learning_rate=1e-5,\n",
-    "        evaluation_strategy=\"steps\",\n",
-    "        eval_steps=100,\n",
-    "        load_best_model_at_end=True,\n",
-    "        metric_for_best_model=\"f1\",\n",
-    "        output_dir=\"./layoutlm\",\n",
-    "    )\n",
-    "    trainer = Trainer(\n",
-    "        model=model,\n",
-    "        args=training_args,\n",
-    "        train_dataset=train_dataset,\n",
-    "        eval_dataset=eval_dataset,\n",
-    "        tokenizer=processor,\n",
-    "        data_collator=default_data_collator,\n",
-    "        compute_metrics=lambda p: compute_metrics(p, metric=metric, label_list=LABELS),\n",
-    "    )\n",
-    "\n",
-    "    mlflow.set_experiment(\"exhibit21_extraction_test\")\n",
-    "    with mlflow.start_run():\n",
-    "        # Train inside mlflow run. Mlflow will automatically handle logging training metrcis\n",
-    "        trainer.train()\n",
-    "\n",
-    "        # Log finetuend model with mlflow\n",
-    "        mlflow.transformers.log_model(\n",
-    "            trainer, artifact_path=\"layoutlm_extractor\", task=\"token-classification\"\n",
-    "        )"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "ee9b4e20-7781-43a7-b7aa-caf0690a201e",
-   "metadata": {},
-   "source": [
-    "## Model inference\n",
-    "Use the finetuned model to perform inference and evaluate on labeled validation data. First create a Huggingface `Pipeline` which wraps layoutlm with some custom pre/post processing steps. "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "42c8e920-d671-40c2-b5db-c43611a33897",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "import torch\n",
-    "from transformers import Pipeline, pipeline\n",
-    "from transformers.tokenization_utils_base import BatchEncoding\n",
-    "\n",
-    "from mozilla_sec_eia.models.sec10k.utils.layoutlm import (\n",
-    "    iob_to_label,\n",
-    ")\n",
-    "\n",
-    "\n",
-    "def _sort_by_label_priority(target_array):\n",
-    "    id_priority = [label2id[label] for label in LABEL_PRIORITY]\n",
-    "    # Create a priority map from the label priority\n",
-    "    priority_map = {val: idx for idx, val in enumerate(id_priority)}\n",
-    "    # Sort the target array based on the priority map\n",
-    "    sorted_array = sorted(target_array, key=lambda x: priority_map.get(x, float(\"inf\")))\n",
-    "    return sorted_array\n",
-    "\n",
-    "def get_flattened_mode_predictions(token_boxes_tensor, predictions_tensor):\n",
-    "    \"\"\"Get the mode prediction for each box in an Ex. 21.\n",
-    "\n",
-    "    When handling multi page documents LayoutLM uses a sliding 'frame'\n",
-    "    with some overlap between frames. The overlap creates multiple\n",
-    "    predictions for the same bounding boxes. Thus it's necessary to find\n",
-    "    the mode of all the predictions for a bounding box and use that as the\n",
-    "    single prediction for each box. If there are multiple mode\n",
-    "    predictions for a bounding box, then ties are broken by setting\n",
-    "    a priority for the labels (LABEL_PRIORITY) and choosing the highest priority\n",
-    "    label.\n",
-    "    \"\"\"\n",
-    "    # Flatten the tensors\n",
-    "    flat_token_boxes = token_boxes_tensor.view(-1, 4)\n",
-    "    flat_predictions = predictions_tensor.view(-1)\n",
-    "\n",
-    "    boxes = flat_token_boxes.numpy()\n",
-    "    predictions = flat_predictions.numpy()\n",
-    "\n",
-    "    # Find unique boxes and indices\n",
-    "    unique_boxes, inverse_indices = np.unique(boxes, axis=0, return_inverse=True)\n",
-    "\n",
-    "    # Compute the mode for each unique bounding box\n",
-    "    # for each unique box in boxes, create a list with all predictions for that box\n",
-    "    # get the indices in predictions where the corresponding index in boxes is\n",
-    "    unique_box_predictions = [\n",
-    "        predictions[np.where(inverse_indices == i)[0]] for i in range(len(unique_boxes))\n",
-    "    ]\n",
-    "    pred_counts = [np.bincount(arr) for arr in unique_box_predictions]\n",
-    "    # Compute the mode of predictions for each group\n",
-    "    # break ties by taking into account LABEL_PRIORITY\n",
-    "    modes = np.array(\n",
-    "        [\n",
-    "            _sort_by_label_priority(np.where(arr == np.max(arr))[0])[0]\n",
-    "            for arr in pred_counts\n",
-    "        ]\n",
-    "    )\n",
-    "    flattened_modes = modes[inverse_indices]\n",
-    "\n",
-    "    return flattened_modes\n",
-    "\n",
-    "class LayoutLMInferencePipeline(Pipeline):\n",
-    "    \"\"\"Pipeline for performing inference with fine-tuned LayoutLM.\"\"\"\n",
-    "\n",
-    "    def __init__(self, *args, **kwargs):\n",
-    "        \"\"\"Initialize LayoutLMInferencePipeline.\"\"\"\n",
-    "        super().__init__(*args, **kwargs)\n",
-    "\n",
-    "    def _sanitize_parameters(self, **kwargs):\n",
-    "        preprocess_kwargs = {}\n",
-    "        if \"maybe_arg\" in kwargs:\n",
-    "            preprocess_kwargs[\"maybe_arg\"] = kwargs[\"maybe_arg\"]\n",
-    "        return preprocess_kwargs, {}, {}\n",
-    "\n",
-    "    def preprocess(self, doc_dict):\n",
-    "        \"\"\"Encode and tokenize model inputs.\"\"\"\n",
-    "        image = doc_dict[\"image\"]\n",
-    "        words = doc_dict[\"tokens\"]\n",
-    "        boxes = doc_dict[\"bboxes\"]\n",
-    "        encoding = self.tokenizer(\n",
-    "            image,\n",
-    "            words,\n",
-    "            boxes=boxes,\n",
-    "            return_tensors=\"pt\",\n",
-    "            truncation=True,\n",
-    "            padding=\"max_length\",\n",
-    "            max_length=512,  # this is the maximum max_length\n",
-    "            stride=128,\n",
-    "            return_offsets_mapping=True,\n",
-    "            return_overflowing_tokens=True,\n",
-    "        )\n",
-    "        model_inputs = {}\n",
-    "        model_inputs[\"raw_encoding\"] = encoding.copy()\n",
-    "        model_inputs[\"doc_dict\"] = doc_dict\n",
-    "        model_inputs[\"offset_mapping\"] = encoding.pop(\"offset_mapping\")\n",
-    "        model_inputs[\"sample_mapping\"] = encoding.pop(\"overflow_to_sample_mapping\")\n",
-    "        # TODO: do we actually need to make these into ints?\n",
-    "        encoding[\"input_ids\"] = encoding[\"input_ids\"].to(torch.int64)\n",
-    "        encoding[\"attention_mask\"] = encoding[\"attention_mask\"].to(torch.int64)\n",
-    "        encoding[\"bbox\"] = encoding[\"bbox\"].to(torch.int64)\n",
-    "        encoding[\"pixel_values\"] = torch.stack(encoding[\"pixel_values\"])\n",
-    "        model_inputs[\"encoding\"] = encoding\n",
-    "        return model_inputs\n",
-    "\n",
-    "    def _forward(self, model_inputs):\n",
-    "        # encoding is passed as a UserDict in the model_inputs dictionary\n",
-    "        # turn it back into a BatchEncoding\n",
-    "        encoding = BatchEncoding(model_inputs[\"encoding\"])\n",
-    "        if torch.cuda.is_available():\n",
-    "            encoding.to(\"cuda\")\n",
-    "            self.model.to(\"cuda\")\n",
-    "        # since we're doing inference, we don't need gradient computation\n",
-    "        with torch.no_grad():\n",
-    "            output = self.model(**encoding)\n",
-    "            return {\n",
-    "                \"logits\": output.logits,\n",
-    "                \"predictions\": output.logits.argmax(-1).squeeze().tolist(),\n",
-    "                \"raw_encoding\": model_inputs[\"raw_encoding\"],\n",
-    "                \"doc_dict\": model_inputs[\"doc_dict\"],\n",
-    "            }\n",
-    "\n",
-    "    def postprocess(self, all_outputs):\n",
-    "        \"\"\"Return logits, model predictions, and the extracted dataframe.\"\"\"\n",
-    "        logits = all_outputs[\"logits\"]\n",
-    "        predictions = all_outputs[\"logits\"].argmax(-1).squeeze().tolist()\n",
-    "        output_df = self.extract_table(all_outputs)\n",
-    "        return logits, predictions, output_df\n",
-    "\n",
-    "    def extract_table(self, all_outputs):\n",
-    "        \"\"\"Extract a structured table from a set of inference predictions.\n",
-    "\n",
-    "        This function essentially works by stacking bounding boxes and predictions\n",
-    "        into a dataframe and going from left to right and top to bottom. Then, every\n",
-    "        every time a new subsidiary entity is encountered, it assigns a new group or\n",
-    "        \"row\" to that subsidiary. Next, location and ownership percentage words/labeled\n",
-    "        entities in between these subsidiary groups are assigned to a subsidiary row/group.\n",
-    "        Finally, this is all formatted into a dataframe with an ID column from the original\n",
-    "        filename and a basic cleaning function normalizes strings.\n",
-    "        \"\"\"\n",
-    "        # TODO: when model more mature, break this into sub functions to make it\n",
-    "        # clearer what's going on\n",
-    "        predictions = all_outputs[\"predictions\"]\n",
-    "        encoding = all_outputs[\"raw_encoding\"]\n",
-    "        doc_dict = all_outputs[\"doc_dict\"]\n",
-    "\n",
-    "        token_boxes_tensor = encoding[\"bbox\"].flatten(start_dim=0, end_dim=1)\n",
-    "        predictions_tensor = torch.tensor(predictions)\n",
-    "        mode_predictions = get_flattened_mode_predictions(\n",
-    "            token_boxes_tensor, predictions_tensor\n",
-    "        )\n",
-    "        token_boxes = encoding[\"bbox\"].flatten(start_dim=0, end_dim=1).tolist()\n",
-    "        predicted_labels = [\n",
-    "            self.model.config.id2label[pred] for pred in mode_predictions\n",
-    "        ]\n",
-    "        simple_preds = [iob_to_label(pred).lower() for pred in predicted_labels]\n",
-    "\n",
-    "        df = pd.DataFrame(data=token_boxes, columns=BBOX_COLS)\n",
-    "        df.loc[:, \"iob_pred\"] = predicted_labels\n",
-    "        df.loc[:, \"pred\"] = simple_preds\n",
-    "        invalid_mask = (\n",
-    "            (df[\"top_left_x\"] == 0)\n",
-    "            & (df[\"top_left_y\"] == 0)\n",
-    "            & (df[\"bottom_right_x\"] == 0)\n",
-    "            & (df[\"bottom_right_y\"] == 0)\n",
-    "        )\n",
-    "        df = df[~invalid_mask]\n",
-    "        # we want to get actual words on the dataframe, not just subwords that correspond to tokens\n",
-    "        # subwords from the same word share the same bounding box coordinates\n",
-    "        # so we merge the original words onto our dataframe on bbox coordinates\n",
-    "        words_df = pd.DataFrame(data=doc_dict[\"bboxes\"], columns=BBOX_COLS)\n",
-    "        words_df.loc[:, \"word\"] = doc_dict[\"tokens\"]\n",
-    "        df = df.merge(words_df, how=\"left\", on=BBOX_COLS).drop_duplicates(\n",
-    "            subset=BBOX_COLS + [\"pred\", \"word\"]\n",
-    "        )\n",
-    "        # rows that are the first occurrence in a new group (subsidiary, loc, own_per)\n",
-    "        # should always have a B entity label. Manually override labels so this is true.\n",
-    "        first_in_group_df = df[\n",
-    "            (df[\"pred\"].ne(df[\"pred\"].shift())) & (df[\"pred\"] != \"other\")\n",
-    "        ]\n",
-    "        first_in_group_df.loc[:, \"iob_pred\"] = (\n",
-    "            \"B\" + first_in_group_df[\"iob_pred\"].str[1:]\n",
-    "        )\n",
-    "        df.update(first_in_group_df)\n",
-    "        # filter for just words that were labeled with non \"other\" entities\n",
-    "        entities_df = df.sort_values(by=[\"top_left_y\", \"top_left_x\"])\n",
-    "        entities_df = entities_df[entities_df[\"pred\"] != \"other\"]\n",
-    "        # words are labeled with IOB format which stands for inside, outside, beginning\n",
-    "        # merge B and I entities to form one entity group\n",
-    "        # (i.e. \"B-Subsidiary\" and \"I-Subsidiary\" become just \"subsidiary\"), assign a group ID\n",
-    "        entities_df[\"group\"] = (entities_df[\"iob_pred\"].str.startswith(\"B-\")).cumsum()\n",
-    "        grouped_df = (\n",
-    "            entities_df.groupby([\"group\", \"pred\"])[\"word\"]\n",
-    "            .apply(\" \".join)\n",
-    "            .reset_index()[[\"pred\", \"word\"]]\n",
-    "        )\n",
-    "        # assign a new row every time there's a new subsidiary\n",
-    "        grouped_df[\"row\"] = (grouped_df[\"pred\"].str.startswith(\"subsidiary\")).cumsum()\n",
-    "        output_df = grouped_df.pivot_table(\n",
-    "            index=\"row\", columns=\"pred\", values=\"word\", aggfunc=lambda x: \" \".join(x)\n",
-    "        ).reset_index()\n",
-    "        if output_df.empty:\n",
-    "            return output_df\n",
-    "        output_df.loc[:, \"id\"] = doc_dict[\"id\"]\n",
-    "        return output_df"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "ea9fe887-43ca-43e2-85e3-bf5371bd165f",
-   "metadata": {},
-   "source": [
-    "Next, wrap the `LayoutLMInferencePipeline` in an `mlflow` `pyfunc` model, which handles loading the pretrained model and managing inputs/outputs."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "4d802e00-1ca4-40b3-b15b-561711a9db70",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "from mozilla_sec_eia.models.sec10k.entities import (\n",
-    "    Ex21CompanyOwnership,\n",
-    "    Sec10kExtractionMetadata,\n",
-    ")\n",
-    "from mozilla_sec_eia.models.sec10k.ex_21.inference import clean_extracted_df\n",
-    "\n",
-    "# Construct model_uri from model_version\n",
-    "model_uri = f\"models:/layoutlm_extractor/{context.op_config['model_version']}\"\n",
-    "model_info = mlflow.models.get_model_info(model_uri)\n",
-    "\n",
-    "def _get_data(dataset):\n",
-    "    yield from dataset\n",
-    "\n",
-    "class Ex21Extractor(mlflow.pyfunc.PythonModel):\n",
-    "    \"\"\"Create an mlflow pyfunc model to perform full EX21 extraction.\"\"\"\n",
-    "    def load_context(self, context):\n",
-    "        \"\"\"Load pretrained model.\"\"\"\n",
-    "        os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"expandable_segments:True\"\n",
-    "        self.model_components = mlflow.transformers.load_model(\n",
-    "            context.artifacts[\"layoutlm_extractor\"], return_type=\"components\"\n",
-    "        )\n",
-    "\n",
-    "    def predict(self, context, model_input: Dataset, params=None):\n",
-    "        \"\"\"Use pretrained model and inference pipeline to perform inference.\"\"\"\n",
-    "        # TODO: figure out device argument\n",
-    "        pipe = pipeline(\n",
-    "            \"token-classification\",\n",
-    "            model=self.model_components[\"model\"],\n",
-    "            tokenizer=self.model_components[\"tokenizer\"],\n",
-    "            pipeline_class=LayoutLMInferencePipeline,\n",
-    "        )\n",
-    "\n",
-    "        logits = []\n",
-    "        predictions = []\n",
-    "        all_output_df = Ex21CompanyOwnership.example(size=0)\n",
-    "        extraction_metadata = Sec10kExtractionMetadata.example(size=0)\n",
-    "        for logit, pred, output_df in pipe(_get_data(model_input)):\n",
-    "            logits.append(logit)\n",
-    "            predictions.append(pred)\n",
-    "            if not output_df.empty:\n",
-    "                filename = get_metadata_filename(output_df[\"id\"].iloc[0])\n",
-    "                extraction_metadata.loc[filename, [\"success\"]] = True\n",
-    "            all_output_df = pd.concat([all_output_df, output_df])\n",
-    "        all_output_df.columns.name = None\n",
-    "        all_output_df = clean_extracted_df(all_output_df)\n",
-    "        all_output_df = all_output_df[[\"id\", \"subsidiary\", \"loc\", \"own_per\"]]\n",
-    "        all_output_df = all_output_df.reset_index(drop=True)\n",
-    "        return extraction_metadata, all_output_df\n",
-    "\n",
-    "# Save model to local temp dir with artifacts, then reload for evaluation\n",
-    "with TemporaryDirectory() as tmp_dir:\n",
-    "    mlflow.pyfunc.save_model(\n",
-    "        path=tmp_dir,\n",
-    "        python_model=Ex21Extractor(),\n",
-    "        artifacts={\"model_components\": model_uri},\n",
-    "    )\n",
-    "    ex21_extraction_model = mlflow.pyfunc.load_model(tmp_dir)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "fee84b13-6c37-4afe-8faa-003ff149aa2d",
-   "metadata": {},
-   "source": [
-    "### Model Evaluation\n",
-    "Now the full extraction model can be evaluated using labeled validation data and logged to `mlflow`. The `mlflow` run used to evaluate and log the inference model will be created as a nested child run to the run used to train `layoutlm`. This setup allows multiple versions/configurations of inference to be associated with a single version of `layoutlm`, creating a clean organizational structure for testing the base model and inference logic separately."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "47c19b41-131f-4059-8f42-931237565a20",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "def clean_ex21_validation_set(validation_df: pd.DataFrame):\n",
-    "    \"\"\"Clean Ex. 21 validation data to match extracted format.\"\"\"\n",
-    "    validation_df = validation_df.rename(\n",
-    "        columns={\n",
-    "            \"Filename\": \"id\",\n",
-    "            \"Subsidiary\": \"subsidiary\",\n",
-    "            \"Location of Incorporation\": \"loc\",\n",
-    "            \"Ownership Percentage\": \"own_per\",\n",
-    "        }\n",
-    "    )\n",
-    "    validation_df[\"own_per\"] = validation_df[\"own_per\"].astype(str)\n",
-    "    validation_df[\"filename\"] = validation_df[\"id\"].apply(get_metadata_filename)\n",
-    "    validation_df = clean_extracted_df(validation_df)\n",
-    "    return validation_df\n",
-    "\n",
-    "# Load labeled validation set\n",
-    "validation_set = clean_ex21_validation_set(\n",
-    "    validation_helpers.load_validation_data(\"ex21_labels.csv\")\n",
-    ")\n",
-    "\n",
-    "# Get filing metadata for filings in validation set\n",
-    "cloud_interface = GCSArchive()\n",
-    "filing_metadata = cloud_interface.get_metadata()\n",
-    "ex21_validation_filing_metadata = filing_metadata[\n",
-    "    filing_metadata.index.isin(validation_set[\"filename\"].unique())\n",
-    "]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "eddcc912-324a-42e9-9841-3a916c6ece6b",
-   "metadata": {},
-   "source": [
-    "Next define methods evaluating model output, then run extraction and log in child run."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f79bd14d-5156-4f34-9a50-e9c813b822cf",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "from mozilla_sec_eia.models.sec10k.ex_21.inference import create_inference_dataset\n",
-    "\n",
-    "\n",
-    "def ex21_validation_metrics(computed_df: pd.DataFrame, validation_df: pd.DataFrame):\n",
-    "    \"\"\"Compute validation metrics for Ex. 21 extraction.\"\"\"\n",
-    "    shared_cols = validation_df.columns.intersection(computed_df.columns)\n",
-    "    validation_df = validation_df.astype(computed_df[shared_cols].dtypes)\n",
-    "    n_equal = 0\n",
-    "    validation_filenames = validation_df[\"id\"].unique()\n",
-    "    n_files = len(validation_filenames)\n",
-    "    table_metrics_dict = {}\n",
-    "    jaccard_dict = {}\n",
-    "    incorrect_files = []\n",
-    "    # iterate through each file and check each extracted table\n",
-    "    for filename in validation_filenames:\n",
-    "        extracted_table_df = computed_df[computed_df[\"id\"] == filename].reset_index(\n",
-    "            drop=True\n",
-    "        )\n",
-    "        validation_table_df = validation_df[\n",
-    "            validation_df[\"id\"] == filename\n",
-    "        ].reset_index(drop=True)\n",
-    "        # check if the tables are exactly equal\n",
-    "        if extracted_table_df.equals(validation_table_df):\n",
-    "            # TODO: strip llc and other company strings before comparison\n",
-    "            n_equal += 1\n",
-    "        else:\n",
-    "            incorrect_files.append(filename)\n",
-    "        # compute precision and recall for each column\n",
-    "        table_metrics_dict[filename] = {}\n",
-    "        jaccard_dict[filename] = {}\n",
-    "        for col in [\"subsidiary\", \"loc\", \"own_per\"]:\n",
-    "            table_prec_recall = validation_helpers.pandas_compute_precision_recall(\n",
-    "                extracted_table_df, validation_table_df, value_col=col\n",
-    "            )\n",
-    "            table_metrics_dict[filename][f\"{col}_precision\"] = table_prec_recall[\n",
-    "                \"precision\"\n",
-    "            ]\n",
-    "            table_metrics_dict[filename][f\"{col}_recall\"] = table_prec_recall[\"recall\"]\n",
-    "            # get the jaccard similarity between columns\n",
-    "            jaccard_dict[filename][col] = validation_helpers.jaccard_similarity(\n",
-    "                computed_df=extracted_table_df,\n",
-    "                validation_df=validation_table_df,\n",
-    "                value_col=col,\n",
-    "            )\n",
-    "\n",
-    "    jaccard_df = pd.DataFrame.from_dict(jaccard_dict, orient=\"index\").reset_index()\n",
-    "    prec_recall_df = pd.DataFrame.from_dict(\n",
-    "        table_metrics_dict, orient=\"index\"\n",
-    "    ).reset_index()\n",
-    "\n",
-    "    return (\n",
-    "        jaccard_df,\n",
-    "        prec_recall_df,\n",
-    "        pd.DataFrame({\"filename\": incorrect_files}),\n",
-    "        {\n",
-    "            \"table_accuracy\": n_equal / n_files,\n",
-    "            \"avg_subsidiary_jaccard_sim\": jaccard_df[\"subsidiary\"].sum() / n_files,\n",
-    "            \"avg_location_jaccard_sim\": jaccard_df[\"loc\"].sum() / n_files,\n",
-    "            \"avg_own_per_jaccard_sim\": jaccard_df[\"own_per\"].sum() / n_files,\n",
-    "            \"avg_subsidiary_precision\": prec_recall_df[\"subsidiary_precision\"].sum()\n",
-    "            / n_files,\n",
-    "            \"avg_location_precision\": prec_recall_df[\"loc_precision\"].sum() / n_files,\n",
-    "            \"avg_own_per_precision\": prec_recall_df[\"own_per_precision\"].sum()\n",
-    "            / n_files,\n",
-    "            \"avg_subsidiary_recall\": prec_recall_df[\"subsidiary_recall\"].sum()\n",
-    "            / n_files,\n",
-    "            \"avg_location_recall\": prec_recall_df[\"loc_recall\"].sum() / n_files,\n",
-    "            \"avg_own_per_recall\": prec_recall_df[\"own_per_recall\"].sum() / n_files,\n",
-    "        },\n",
-    "    )\n",
-    "\n",
-    "\n",
-    "with mlflow.start_run(parent_run_id=model_info.run_id, nested=True):\n",
-    "    failed_metadata, dataset = create_inference_dataset(\n",
-    "        filings=ex21_validation_filing_metadata,\n",
-    "        cloud_interface=cloud_interface,\n",
-    "        has_labels=True,\n",
-    "    )\n",
-    "    metadata, extracted = ex21_extraction_model.predict(dataset)\n",
-    "    metadata = pd.concat([failed_metadata, metadata])\n",
-    "\n",
-    "    jaccard_df, prec_recall_df, incorrect_filenames, metrics = ex21_validation_metrics(extracted, validation_set)\n",
-    "    mlflow.log_metrics(metrics)\n",
-    "    mlflow.pyfunc.log_model(\"exhibit21_extractor\", python_model=ex21_extraction_model)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "45a5b13a-2276-4fb2-80dd-76e3f1184bea",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.10"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py
index b685063..1a5ec96 100644
--- a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py
+++ b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py
@@ -173,7 +173,7 @@ def get_metadata(self, year_quarter: str | None = None) -> pd.DataFrame:
         """Return dataframe of filing metadata."""
         selection = None
         if year_quarter is not None:
-            selection = ["year_quarter", "==", year_quarter]
+            selection = [("year_quarter", "==", year_quarter)]
 
         return pd.read_parquet(
             self.outputs_bucket_path / "sec10k_filing_metadata", filters=selection
diff --git a/tests/unit/models/sec10k/ex21_model_test.py b/tests/unit/models/sec10k/ex21_model_test.py
index ebfe63d..0e23fed 100644
--- a/tests/unit/models/sec10k/ex21_model_test.py
+++ b/tests/unit/models/sec10k/ex21_model_test.py
@@ -2,8 +2,10 @@
 
 import torch
 
-from mozilla_sec_eia.models.sec10k.ex_21.inference import get_flattened_mode_predictions
-from mozilla_sec_eia.models.sec10k.ex_21.train_extractor import LABELS
+from mozilla_sec_eia.models.sec10k.ex_21.inference import (
+    LABELS,
+    get_flattened_mode_predictions,
+)
 from mozilla_sec_eia.models.sec10k.utils.layoutlm import get_id_label_conversions
 
 

From 37edd50bc1030082a24551113ac4fd18910d1dd3 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Fri, 4 Oct 2024 11:24:33 -0400
Subject: [PATCH 085/161] Split dataset loading into separate assets

---
 src/mozilla_sec_eia/models/sec10k/__init__.py |   16 +-
 .../models/sec10k/ex_21/__init__.py           |   12 -
 .../models/sec10k/ex_21/data.py               |    1 -
 .../models/sec10k/ex_21/data/__init__.py      |   72 ++
 .../models/sec10k/ex_21/data/common.py        |  203 ++++
 .../models/sec10k/ex_21/data/inference.py     |  119 ++
 .../training.py}                              |  101 +-
 .../sec10k/ex_21/ex21_validation_helpers.py   |   35 +
 .../models/sec10k/ex_21/inference.py          |  197 +---
 .../notebooks/exhibit21_extractor.ipynb       |  699 ++++-------
 .../train_exhibit21_extraction.ipynb          | 1045 -----------------
 .../models/sec10k/utils/pdf.py                |   15 +
 tests/unit/models/sec10k/ex21_model_test.py   |    8 +-
 13 files changed, 705 insertions(+), 1818 deletions(-)
 delete mode 100644 src/mozilla_sec_eia/models/sec10k/ex_21/data.py
 create mode 100644 src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py
 create mode 100644 src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py
 create mode 100644 src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py
 rename src/mozilla_sec_eia/models/sec10k/ex_21/{create_labeled_dataset.py => data/training.py} (57%)
 delete mode 100644 src/mozilla_sec_eia/models/sec10k/notebooks/train_exhibit21_extraction.ipynb

diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py
index b482aec..63097e9 100644
--- a/src/mozilla_sec_eia/models/sec10k/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/__init__.py
@@ -1,6 +1,7 @@
 """Implement models to extract data from SEC10k filings."""
 
 from dagster import (
+    AssetIn,
     Config,
     Definitions,
     define_asset_job,
@@ -28,6 +29,7 @@
 
 basic_10k_assets = load_assets_from_modules([basic_10k])
 ex21_assets = load_assets_from_package_module(ex_21)
+ex21_training_data_assets = load_assets_from_modules([ex_21.data])
 shared_assets = load_assets_from_modules([extract])
 
 basic_10k_production_job = model_jobs.create_production_model_job(
@@ -59,16 +61,26 @@ class TrainConfig(Config):
     name="exhibit21_extractor",
     notebook_path=file_relative_path(__file__, "notebooks/exhibit21_extractor.ipynb"),
     config_schema=TrainConfig.to_config_schema(),
+    ins={
+        "ex21_training_data": AssetIn(),
+        "ex21_validation_set": AssetIn(),
+        "ex21_failed_parsing_metadata": AssetIn(),
+        "ex21_inference_dataset": AssetIn(),
+    },
 )
 ex21_training_job = define_asset_job(
     "ex21_training",
-    selection=[exhibit21_extractor],
+    selection=[exhibit21_extractor] + ex21_training_data_assets,
     executor_def=in_process_executor,
 )
 
 
 defs = Definitions(
-    assets=basic_10k_assets + ex21_assets + shared_assets + [exhibit21_extractor],
+    assets=basic_10k_assets
+    + ex21_assets
+    + shared_assets
+    + [exhibit21_extractor]
+    + ex21_training_data_assets,
     jobs=[
         basic_10k_production_job,
         basic_10k_validation_job,
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
index 60e7c97..574074d 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
@@ -2,24 +2,15 @@
 
 import logging
 
-import mlflow
 import pandas as pd
 from dagster import (
-    AssetIn,
     AssetOut,
     In,
     Out,
-    asset,
     graph_multi_asset,
-    multi_asset,
     op,
 )
 
-from mozilla_sec_eia.library.mlflow import MlflowInterface, mlflow_interface_resource
-from mozilla_sec_eia.models.sec10k.ex_21.ex21_validation_helpers import (
-    clean_ex21_validation_set,
-)
-
 from ..entities import (
     Ex21CompanyOwnership,
     Sec10kExtractionMetadata,
@@ -27,7 +18,6 @@
     sec10k_extract_metadata_type,
 )
 from ..extract import chunk_filings, sec10k_filing_metadata, year_quarter_partitions
-from ..utils.cloud import GCSArchive, cloud_interface_resource, get_metadata_filename
 from .inference import extract_filings
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
@@ -84,12 +74,10 @@ def collect_extracted_chunks(
             io_manager_key="pandas_parquet_io_manager"
         ),
     },
-    ins={"layoutlm": AssetIn(input_manager_key="layoutlm_io_manager")},
     partitions_def=year_quarter_partitions,
 )
 def ex21_extract(
     sec10k_filing_metadata: pd.DataFrame,
-    layoutlm,
 ):
     """Extract ownership info from exhibit 21 docs."""
     filing_chunks = chunk_filings(sec10k_filing_metadata)
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data.py
deleted file mode 100644
index 4e331c8..0000000
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/data.py
+++ /dev/null
@@ -1 +0,0 @@
-"""Define methods and assets for handling datasets used by."""
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py
new file mode 100644
index 0000000..da5525f
--- /dev/null
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py
@@ -0,0 +1,72 @@
+"""Tools for constructing datasets used by exhibit 21 extraction model."""
+
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+import pandas as pd
+from dagster import AssetOut, Config, asset, multi_asset
+
+from mozilla_sec_eia.library import validation_helpers
+
+from ...entities import ex21_extract_type, sec10k_extract_metadata_type
+from ...utils.cloud import GCSArchive
+from ..ex21_validation_helpers import clean_ex21_validation_set
+from .inference import create_inference_dataset
+from .training import format_as_ner_annotations
+
+
+class Ex21TrainingConfig(Config):
+    """Configure asset to produce ex21 training data."""
+
+    training_set: str = "labeledv0.2"
+
+
+@asset
+def ex21_training_data(config: Ex21TrainingConfig):
+    """Construct training dataset for ex 21 extraction."""
+    with TemporaryDirectory() as temp_dir:
+        ner_annotations = format_as_ner_annotations(
+            labeled_json_path=Path(temp_dir) / "sec10k_filings" / "labeled_jsons",
+            pdfs_path=Path(temp_dir) / "sec10k_filings" / "pdfs",
+            gcs_folder_name=config.training_set,
+        )
+    return ner_annotations
+
+
+@asset(dagster_type=ex21_extract_type)
+def ex21_validation_set() -> pd.DataFrame:
+    """Return dataframe containing basic 10k validation data."""
+    return clean_ex21_validation_set(
+        validation_helpers.load_validation_data("ex21_labels.csv")
+    )
+
+
+@asset
+def ex21_validation_filing_metadata(
+    cloud_interface: GCSArchive,
+    ex21_validation_set: pd.DataFrame,
+) -> pd.DataFrame:
+    """Get sec 10k filing metadata from validation set."""
+    filing_metadata = cloud_interface.get_metadata()
+    return filing_metadata[
+        filing_metadata.index.isin(ex21_validation_set["filename"].unique())
+    ]
+
+
+@multi_asset(
+    outs={
+        "ex21_failed_parsing_metadata": AssetOut(
+            dagster_type=sec10k_extract_metadata_type,
+        ),
+        "ex21_inference_dataset": AssetOut(),
+    },
+)
+def ex21_inference_dataset(
+    cloud_interface: GCSArchive,
+    ex21_validation_filing_metadata: pd.DataFrame,
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Construct inference dataset for ex 21 extraction."""
+    return create_inference_dataset(
+        filing_metadata=ex21_validation_filing_metadata,
+        cloud_interface=cloud_interface,
+    )
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py
new file mode 100644
index 0000000..5f79109
--- /dev/null
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py
@@ -0,0 +1,203 @@
+"""Implement methods used to construct both inference and training sets."""
+
+import json
+import os
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+
+from mozilla_sec_eia.library import validation_helpers
+
+from ...utils.pdf import get_pdf_data_from_path
+
+LABEL_PRIORITY = [
+    "I-Subsidiary",
+    "I-Loc",
+    "I-Own_Per",
+    "B-Subsidiary",
+    "B-Loc",
+    "B-Own_Per",
+    "O",
+]
+LABELS = [
+    "O",
+    "B-Subsidiary",
+    "I-Subsidiary",
+    "B-Loc",
+    "I-Loc",
+    "B-Own_Per",
+    "I-Own_Per",
+]
+
+BBOX_COLS = ["top_left_x", "top_left_y", "bottom_right_x", "bottom_right_y"]
+
+BBOX_COLS_PDF = [
+    "top_left_x_pdf",
+    "top_left_y_pdf",
+    "bottom_right_x_pdf",
+    "bottom_right_y_pdf",
+]
+
+
+def normalize_bboxes(txt_df, pg_meta_df):
+    """Normalize bboxes between 0 and 1000."""
+    txt_df["top_left_x_pdf"] = (
+        txt_df["top_left_x_pdf"] / pg_meta_df.width_pdf_coord.iloc[0] * 1000
+    )
+    txt_df["top_left_y_pdf"] = (
+        txt_df["top_left_y_pdf"] / pg_meta_df.height_pdf_coord.iloc[0] * 1000
+    )
+    txt_df["bottom_right_x_pdf"] = (
+        txt_df["bottom_right_x_pdf"] / pg_meta_df.width_pdf_coord.iloc[0] * 1000
+    )
+    txt_df["bottom_right_y_pdf"] = (
+        txt_df["bottom_right_y_pdf"] / pg_meta_df.height_pdf_coord.iloc[0] * 1000
+    )
+    return txt_df
+
+
+def unnormalize_box(bbox, width, height):
+    """Unnormalize bboxes for drawing onto an image."""
+    return [
+        width * (bbox[0] / 1000),
+        height * (bbox[1] / 1000),
+        width * (bbox[2] / 1000),
+        height * (bbox[3] / 1000),
+    ]
+
+
+def get_id_label_conversions(labels):
+    """Return dicts mapping ids to labels and labels to ids."""
+    id2label = dict(enumerate(labels))
+    label2id = {v: k for k, v in enumerate(labels)}
+    return id2label, label2id
+
+
+def iob_to_label(label):
+    """Convert an IOB entity label to a standard string label.
+
+    i.e. 'B-Subsidiary' becomes 'Subsidiary'.
+    """
+    label = label[2:]
+    if not label:
+        return "other"
+    return label
+
+
+def _is_cik_in_training_data(labeled_json_filename, tracking_df):
+    # TODO: for now CIK is stored as an int, update when fixed
+    cik = int(labeled_json_filename.split("/")[-1].split("-")[0])
+    return cik in tracking_df.CIK.unique()
+
+
+def format_label_studio_output(
+    labeled_json_dir: Path,
+    pdfs_dir: Path,
+) -> pd.DataFrame:
+    """Format Label Studio output JSONs into dataframe."""
+    labeled_df = pd.DataFrame()
+    tracking_df = validation_helpers.load_training_data("ex21_labels.csv")
+
+    for json_filename in os.listdir(labeled_json_dir):
+        if not json_filename[0].isdigit() or json_filename.endswith(".json"):
+            continue
+        json_file_path = labeled_json_dir / json_filename
+        with Path.open(json_file_path) as j:
+            doc_dict = json.loads(j.read())
+            filename = doc_dict["task"]["data"]["ocr"].split("/")[-1].split(".")[0]
+            # check if old local naming schema is being used
+            if len(filename.split("-")) == 6:
+                filename = "-".join(filename.split("-")[2:])
+            if not _is_cik_in_training_data(filename, tracking_df=tracking_df):
+                continue
+            pdf_filename = filename + ".pdf"
+            src_path = pdfs_dir / pdf_filename
+            extracted, pg = get_pdf_data_from_path(src_path)
+            txt = extracted["pdf_text"]
+            pg_meta = extracted["page"]
+            # normalize bboxes between 0 and 1000 for Hugging Face
+            txt = normalize_bboxes(txt_df=txt, pg_meta_df=pg_meta)
+            # parse the output dictionary of labeled bounding boxes from Label Studio
+            doc_df = pd.DataFrame()
+            for item in doc_dict["result"]:
+                value = item["value"]
+                # sometimes Label Studio will fill in an empty list as a label
+                # when there is really no label
+                # TODO: do this without dict comprehension?
+                if ("labels" in value) and value["labels"] == []:
+                    value = {k: v for k, v in value.items() if k != "labels"}
+                ind = int(item["id"].split("_")[-1])
+                doc_df = pd.concat([doc_df, pd.DataFrame(value, index=[ind])])
+            # combine the bounding boxes for each word
+            doc_df = doc_df.groupby(level=0).first()
+            txt.loc[:, "id"] = filename
+            # TODO: probably want to filter out these empty Ex. 21 docs
+            # the doc might not have any labels in it if it was an empty Ex. 21
+            if "labels" not in doc_df:
+                doc_df.loc[:, "labels"] = pd.Series()
+            output_df = pd.concat([txt, doc_df[["labels"]]], axis=1)
+            labeled_df = pd.concat([labeled_df, output_df])
+
+    # fill in unlabeled words and clean up labeled dataframe
+    labeled_df["labels"] = labeled_df["labels"].fillna("O")
+    labeled_df = labeled_df.rename(columns={"labels": "ner_tag"})
+    non_id_columns = [col for col in labeled_df.columns if col != "id"]
+    labeled_df = labeled_df.loc[:, ["id"] + non_id_columns]
+
+    # TODO: add in sanity checks on labeled_df bounding boxes to make sure
+    # that no value is above 1000 or below 0
+
+    return labeled_df
+
+
+def _sort_by_label_priority(target_array):
+    _, label2id = get_id_label_conversions(LABELS)
+    id_priority = [label2id[label] for label in LABEL_PRIORITY]
+    # Create a priority map from the label priority
+    priority_map = {val: idx for idx, val in enumerate(id_priority)}
+    # Sort the target array based on the priority map
+    sorted_array = sorted(target_array, key=lambda x: priority_map.get(x, float("inf")))
+    return sorted_array
+
+
+def get_flattened_mode_predictions(token_boxes_tensor, predictions_tensor):
+    """Get the mode prediction for each box in an Ex. 21.
+
+    When handling multi page documents LayoutLM uses a sliding 'frame'
+    with some overlap between frames. The overlap creates multiple
+    predictions for the same bounding boxes. Thus it's necessary to find
+    the mode of all the predictions for a bounding box and use that as the
+    single prediction for each box. If there are multiple mode
+    predictions for a bounding box, then ties are broken by setting
+    a priority for the labels (LABEL_PRIORITY) and choosing the highest priority
+    label.
+    """
+    # Flatten the tensors
+    flat_token_boxes = token_boxes_tensor.view(-1, 4)
+    flat_predictions = predictions_tensor.view(-1)
+
+    boxes = flat_token_boxes.numpy()
+    predictions = flat_predictions.numpy()
+
+    # Find unique boxes and indices
+    unique_boxes, inverse_indices = np.unique(boxes, axis=0, return_inverse=True)
+
+    # Compute the mode for each unique bounding box
+    # for each unique box in boxes, create a list with all predictions for that box
+    # get the indices in predictions where the corresponding index in boxes is
+    unique_box_predictions = [
+        predictions[np.where(inverse_indices == i)[0]] for i in range(len(unique_boxes))
+    ]
+    pred_counts = [np.bincount(arr) for arr in unique_box_predictions]
+    # Compute the mode of predictions for each group
+    # break ties by taking into account LABEL_PRIORITY
+    modes = np.array(
+        [
+            _sort_by_label_priority(np.where(arr == np.max(arr))[0])[0]
+            for arr in pred_counts
+        ]
+    )
+    flattened_modes = modes[inverse_indices]
+
+    return flattened_modes
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py
new file mode 100644
index 0000000..56a7d9b
--- /dev/null
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py
@@ -0,0 +1,119 @@
+"""Create inference dataset for exhibit 21 extraction model."""
+
+import logging
+import os
+import tempfile
+from pathlib import Path
+
+import pandas as pd
+
+from ...utils.cloud import GCSArchive
+from ...utils.pdf import get_image_dict, get_pdf_data_from_path
+from .common import BBOX_COLS_PDF, format_label_studio_output, normalize_bboxes
+
+logger = logging.getLogger(f"catalystcoop.{__name__}")
+
+
+logger = logging.getLogger(f"catalystcoop.{__name__}")
+
+
+def format_unlabeled_pdf_dataframe(pdfs_dir: Path):
+    """Read and format PDFs into a dataframe (without labels)."""
+    inference_df = pd.DataFrame()
+    for pdf_filename in os.listdir(pdfs_dir):
+        if not pdf_filename.endswith(".pdf"):
+            continue
+        src_path = pdfs_dir / pdf_filename
+        filename = Path(pdf_filename).stem
+        extracted, pg = get_pdf_data_from_path(src_path)
+        txt = extracted["pdf_text"]
+        pg_meta = extracted["page"]
+        # normalize bboxes between 0 and 1000 for Hugging Face
+        txt = normalize_bboxes(txt_df=txt, pg_meta_df=pg_meta)
+        txt.loc[:, "id"] = filename
+        inference_df = pd.concat([inference_df, txt])
+    return inference_df
+
+
+def _cache_pdfs(
+    filings: pd.DataFrame, cloud_interface: GCSArchive, pdf_dir: Path
+) -> pd.DataFrame:
+    """Iterate filings and cache pdfs."""
+    extraction_metadata = pd.DataFrame(
+        {
+            "filename": pd.Series(dtype=str),
+            "success": pd.Series(dtype=bool),
+            "notes": pd.Series(dtype=str),
+        }
+    ).set_index("filename")
+
+    for filing in cloud_interface.iterate_filings(filings):
+        pdf_path = cloud_interface.get_local_filename(
+            cache_directory=pdf_dir, filing=filing, extension=".pdf"
+        )
+
+        # Some filings are poorly formatted and fail in `save_as_pdf`
+        # We want a record of these but don't want to stop run
+        try:
+            with pdf_path.open("wb") as f:
+                filing.ex_21.save_as_pdf(f)
+        except Exception as e:
+            extraction_metadata.loc[filing.filename, ["success"]] = False
+            extraction_metadata.loc[filing.filename, ["note"]] = str(e)
+
+        # Some pdfs are empty. Check for these and remove from dir
+        if pdf_path.stat().st_size == 0:
+            extraction_metadata.loc[filing.filename, ["success"]] = False
+            extraction_metadata.loc[filing.filename, ["note"]] = "PDF empty"
+            pdf_path.unlink()
+
+    return extraction_metadata
+
+
+def create_inference_dataset(
+    filing_metadata: pd.DataFrame, cloud_interface: GCSArchive, has_labels: bool = False
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Create a Hugging Face Dataset from PDFs for inference."""
+    filings_with_ex21 = filing_metadata[~filing_metadata["exhibit_21_version"].isna()]
+
+    # Parse PDFS
+    with (
+        tempfile.TemporaryDirectory() as pdfs_dir,
+        tempfile.TemporaryDirectory() as labeled_json_dir,
+    ):
+        pdfs_dir = Path(pdfs_dir)
+        labeled_json_dir = Path(labeled_json_dir)
+
+        extraction_metadata = _cache_pdfs(
+            filings_with_ex21,
+            cloud_interface=cloud_interface,
+            pdf_dir=pdfs_dir,
+        )
+        if has_labels:
+            inference_df = format_label_studio_output(
+                labeled_json_dir=labeled_json_dir, pdfs_dir=pdfs_dir
+            )
+        else:
+            inference_df = format_unlabeled_pdf_dataframe(pdfs_dir=pdfs_dir)
+        image_dict = get_image_dict(pdfs_dir)
+
+    annotations = []
+    for filename, image in image_dict.items():
+        annotation = {
+            "id": filename,
+            "tokens": inference_df.groupby("id")["text"].apply(list).loc[filename],
+            "bboxes": inference_df.loc[inference_df["id"] == filename, :][BBOX_COLS_PDF]
+            .to_numpy()
+            .tolist(),
+            "image": image.tobytes(),
+            "mode": image.mode,
+            "width": image.size[0],
+            "height": image.size[1],
+        }
+        if has_labels:
+            annotation["ner_tags"] = (
+                inference_df.groupby("id")["ner_tag"].apply(list).loc[filename]
+            )
+        annotations.append(annotation)
+
+    return extraction_metadata, pd.DataFrame(annotations)
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data/training.py
similarity index 57%
rename from src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py
rename to src/mozilla_sec_eia/models/sec10k/ex_21/data/training.py
index 8530705..e354a66 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/training.py
@@ -1,4 +1,4 @@
-"""Module handling Label Studio inputs and outputs and preparing a dataset for fine-tuning."""
+"""Create training dataset for layoutlm extraction."""
 
 import json
 import logging
@@ -7,26 +7,19 @@
 
 import pandas as pd
 
-from ..utils.cloud import GCSArchive
-from ..utils.layoutlm import normalize_bboxes
-from ..utils.pdf import (
+from ...utils.cloud import GCSArchive
+from ...utils.pdf import (
+    get_image_dict,
     get_pdf_data_from_path,
     pil_to_cv2,
     render_page,
 )
+from .common import BBOX_COLS_PDF, format_label_studio_output
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 ROOT_DIR = Path(__file__).parent.parent.parent.parent.parent.parent.resolve()
 
 
-BBOX_COLS_PDF = [
-    "top_left_x_pdf",
-    "top_left_y_pdf",
-    "bottom_right_x_pdf",
-    "bottom_right_y_pdf",
-]
-
-
 def create_inputs_for_label_studio(
     model_version: str = "v1.0",
     pdfs_dir: Path = ROOT_DIR / "sec10k_filings/pdfs",
@@ -135,89 +128,9 @@ def get_bbox_dicts(
     return [box_dict, word_dict]
 
 
-def _is_cik_in_training_data(labeled_json_filename, tracking_df):
-    # TODO: for now CIK is stored as an int, update when fixed
-    cik = int(labeled_json_filename.split("/")[-1].split("-")[0])
-    return cik in tracking_df.CIK.unique()
-
-
-def format_label_studio_output(
-    labeled_json_dir=ROOT_DIR / "sec10k_filings/labeled_jsons",
-    pdfs_dir=ROOT_DIR / "sec10k_filings/pdfs",
-) -> pd.DataFrame:
-    """Format Label Studio output JSONs into dataframe."""
-    labeled_df = pd.DataFrame()
-    # TODO: make this path stuff less janky?
-    tracking_df = pd.read_csv(ROOT_DIR / "labeled_data_tracking.csv")
-    for json_filename in os.listdir(labeled_json_dir):
-        if not json_filename[0].isdigit() or json_filename.endswith(".json"):
-            continue
-        json_file_path = labeled_json_dir / json_filename
-        with Path.open(json_file_path) as j:
-            doc_dict = json.loads(j.read())
-            filename = doc_dict["task"]["data"]["ocr"].split("/")[-1].split(".")[0]
-            # check if old local naming schema is being used
-            if len(filename.split("-")) == 6:
-                filename = "-".join(filename.split("-")[2:])
-            if not _is_cik_in_training_data(filename, tracking_df=tracking_df):
-                continue
-            pdf_filename = filename + ".pdf"
-            src_path = pdfs_dir / pdf_filename
-            extracted, pg = get_pdf_data_from_path(src_path)
-            txt = extracted["pdf_text"]
-            pg_meta = extracted["page"]
-            # normalize bboxes between 0 and 1000 for Hugging Face
-            txt = normalize_bboxes(txt_df=txt, pg_meta_df=pg_meta)
-            # parse the output dictionary of labeled bounding boxes from Label Studio
-            doc_df = pd.DataFrame()
-            for item in doc_dict["result"]:
-                value = item["value"]
-                # sometimes Label Studio will fill in an empty list as a label
-                # when there is really no label
-                # TODO: do this without dict comprehension?
-                if ("labels" in value) and value["labels"] == []:
-                    value = {k: v for k, v in value.items() if k != "labels"}
-                ind = int(item["id"].split("_")[-1])
-                doc_df = pd.concat([doc_df, pd.DataFrame(value, index=[ind])])
-            # combine the bounding boxes for each word
-            doc_df = doc_df.groupby(level=0).first()
-            txt.loc[:, "id"] = filename
-            # TODO: probably want to filter out these empty Ex. 21 docs
-            # the doc might not have any labels in it if it was an empty Ex. 21
-            if "labels" not in doc_df:
-                doc_df.loc[:, "labels"] = pd.Series()
-            output_df = pd.concat([txt, doc_df[["labels"]]], axis=1)
-            labeled_df = pd.concat([labeled_df, output_df])
-
-    # fill in unlabeled words and clean up labeled dataframe
-    labeled_df["labels"] = labeled_df["labels"].fillna("O")
-    labeled_df = labeled_df.rename(columns={"labels": "ner_tag"})
-    non_id_columns = [col for col in labeled_df.columns if col != "id"]
-    labeled_df = labeled_df.loc[:, ["id"] + non_id_columns]
-
-    # TODO: add in sanity checks on labeled_df bounding boxes to make sure
-    # that no value is above 1000 or below 0
-
-    return labeled_df
-
-
-def get_image_dict(pdfs_dir):
-    """Create a dictionary with filenames and their Ex. 21 images."""
-    image_dict = {}
-    for pdf_filename in os.listdir(pdfs_dir):
-        if pdf_filename.split(".")[-1] != "pdf":
-            continue
-        pdf_file_path = pdfs_dir / pdf_filename
-        _, pg = get_pdf_data_from_path(pdf_file_path)
-        full_pg_img = render_page(pg)
-        filename = pdf_filename.split(".")[0]
-        image_dict[filename] = full_pg_img
-    return image_dict
-
-
 def format_as_ner_annotations(
-    labeled_json_path=ROOT_DIR / "sec10k_filings/labeled_jsons",
-    pdfs_path=ROOT_DIR / "sec10k_filings/pdfs",
+    labeled_json_path: Path,
+    pdfs_path: Path,
     gcs_folder_name: str = "labeled/",
 ) -> list[dict]:
     """Format a Label Studio output JSONs as NER annotations.
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/ex21_validation_helpers.py b/src/mozilla_sec_eia/models/sec10k/ex_21/ex21_validation_helpers.py
index d6eebea..fca7168 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/ex21_validation_helpers.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/ex21_validation_helpers.py
@@ -1,10 +1,45 @@
 """Helper functions specific to Exhibit 21 model validation."""
 
+import numpy as np
 import pandas as pd
 
 from mozilla_sec_eia.models.sec10k.utils.cloud import get_metadata_filename
 
 
+def clean_extracted_df(extracted_df):
+    """Perform basic cleaning on a dataframe extracted from an Ex. 21."""
+    if extracted_df.empty:
+        return extracted_df
+    if "row" in extracted_df.columns:
+        extracted_df = extracted_df.drop(columns=["row"])
+    extracted_df["subsidiary"] = extracted_df["subsidiary"].str.strip().str.lower()
+    # strip special chars from the start and end of the string
+    extracted_df["subsidiary"] = extracted_df["subsidiary"].str.replace(
+        r"^[^\w&\s]+|[^\w&\s]+$", "", regex=True
+    )
+    if "loc" in extracted_df.columns:
+        extracted_df["loc"] = extracted_df["loc"].str.strip().str.lower()
+        extracted_df["loc"] = extracted_df["loc"].str.replace(
+            r"[^a-zA-Z&,\s]", "", regex=True
+        )
+    if "own_per" in extracted_df.columns:
+        # remove special chars and letters
+        extracted_df["own_per"] = extracted_df["own_per"].str.replace(
+            r"[^\d.]", "", regex=True
+        )
+        # Find values with multiple decimal points
+        extracted_df["own_per"] = extracted_df["own_per"].str.replace(
+            r"(\d*\.\d+)\..*", r"\1", regex=True
+        )
+        extracted_df["own_per"] = extracted_df["own_per"].replace("", np.nan)
+        extracted_df["own_per"] = extracted_df["own_per"].astype(
+            "float64", errors="ignore"
+        )
+    # drop rows that have a null subsidiary value
+    extracted_df = extracted_df.dropna(subset="subsidiary")
+    return extracted_df
+
+
 def clean_ex21_validation_set(validation_df: pd.DataFrame):
     """Clean Ex. 21 validation data to match extracted format."""
     validation_df = validation_df.rename(
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
index 5633e40..6f517a3 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
@@ -1,162 +1,18 @@
 """Module for formatting inputs and performing inference with a fine-tuned LayoutLM model."""
 
 import logging
-import os
-import tempfile
 import traceback
-from pathlib import Path
 
-import numpy as np
 import pandas as pd
 from mlflow.pyfunc import PyFuncModel
 
 from ..entities import Ex21CompanyOwnership
 from ..utils.cloud import GCSArchive
-from ..utils.layoutlm import (
-    normalize_bboxes,
-)
-from ..utils.pdf import (
-    get_pdf_data_from_path,
-)
-from .create_labeled_dataset import (
-    BBOX_COLS_PDF,
-    format_label_studio_output,
-    get_image_dict,
-)
-
-# When handling multi page documents LayoutLM uses a sliding 'frame'
-# with some overlap between frames. The overlap creates multiple
-# predictions for the same bounding boxes. If there are multiple mode
-# predictions for a bounding box, then ties are broken by setting
-# a priority for the labels and choosing the highest priority label.
-LABEL_PRIORITY = [
-    "I-Subsidiary",
-    "I-Loc",
-    "I-Own_Per",
-    "B-Subsidiary",
-    "B-Loc",
-    "B-Own_Per",
-    "O",
-]
-
-LABELS = [
-    "O",
-    "B-Subsidiary",
-    "I-Subsidiary",
-    "B-Loc",
-    "I-Loc",
-    "B-Own_Per",
-    "I-Own_Per",
-]
-
-BBOX_COLS = ["top_left_x", "top_left_y", "bottom_right_x", "bottom_right_y"]
-label2id = {v: k for k, v in enumerate(LABELS)}
+from .data.inference import create_inference_dataset
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
 
-def format_unlabeled_pdf_dataframe(pdfs_dir: Path):
-    """Read and format PDFs into a dataframe (without labels)."""
-    inference_df = pd.DataFrame()
-    for pdf_filename in os.listdir(pdfs_dir):
-        if not pdf_filename.endswith(".pdf"):
-            continue
-        src_path = pdfs_dir / pdf_filename
-        filename = Path(pdf_filename).stem
-        extracted, pg = get_pdf_data_from_path(src_path)
-        txt = extracted["pdf_text"]
-        pg_meta = extracted["page"]
-        # normalize bboxes between 0 and 1000 for Hugging Face
-        txt = normalize_bboxes(txt_df=txt, pg_meta_df=pg_meta)
-        txt.loc[:, "id"] = filename
-        inference_df = pd.concat([inference_df, txt])
-    return inference_df
-
-
-def _cache_pdfs(
-    filings: pd.DataFrame, cloud_interface: GCSArchive, pdf_dir: Path
-) -> pd.DataFrame:
-    """Iterate filings and cache pdfs."""
-    extraction_metadata = pd.DataFrame(
-        {
-            "filename": pd.Series(dtype=str),
-            "success": pd.Series(dtype=bool),
-            "notes": pd.Series(dtype=str),
-        }
-    ).set_index("filename")
-
-    for filing in cloud_interface.iterate_filings(filings):
-        pdf_path = cloud_interface.get_local_filename(
-            cache_directory=pdf_dir, filing=filing, extension=".pdf"
-        )
-
-        # Some filings are poorly formatted and fail in `save_as_pdf`
-        # We want a record of these but don't want to stop run
-        try:
-            with pdf_path.open("wb") as f:
-                filing.ex_21.save_as_pdf(f)
-        except Exception as e:
-            extraction_metadata.loc[filing.filename, ["success"]] = False
-            extraction_metadata.loc[filing.filename, ["note"]] = str(e)
-
-        # Some pdfs are empty. Check for these and remove from dir
-        if pdf_path.stat().st_size == 0:
-            extraction_metadata.loc[filing.filename, ["success"]] = False
-            extraction_metadata.loc[filing.filename, ["note"]] = "PDF empty"
-            pdf_path.unlink()
-
-    return extraction_metadata
-
-
-def create_inference_dataset(
-    filing_metadata: pd.DataFrame, cloud_interface: GCSArchive, has_labels: bool = False
-) -> tuple[pd.DataFrame, pd.DataFrame]:
-    """Create a Hugging Face Dataset from PDFs for inference."""
-    filings_with_ex21 = filing_metadata[~filing_metadata["exhibit_21_version"].isna()]
-
-    # Parse PDFS
-    with (
-        tempfile.TemporaryDirectory() as pdfs_dir,
-        tempfile.TemporaryDirectory() as labeled_json_dir,
-    ):
-        pdfs_dir = Path(pdfs_dir)
-        labeled_json_dir = Path(labeled_json_dir)
-
-        extraction_metadata = _cache_pdfs(
-            filings_with_ex21,
-            cloud_interface=cloud_interface,
-            pdf_dir=pdfs_dir,
-        )
-        if has_labels:
-            inference_df = format_label_studio_output(
-                labeled_json_dir=labeled_json_dir, pdfs_dir=pdfs_dir
-            )
-        else:
-            inference_df = format_unlabeled_pdf_dataframe(pdfs_dir=pdfs_dir)
-        image_dict = get_image_dict(pdfs_dir)
-
-    annotations = []
-    for filename, image in image_dict.items():
-        annotation = {
-            "id": filename,
-            "tokens": inference_df.groupby("id")["text"].apply(list).loc[filename],
-            "bboxes": inference_df.loc[inference_df["id"] == filename, :][BBOX_COLS_PDF]
-            .to_numpy()
-            .tolist(),
-            "image": image.tobytes(),
-            "mode": image.mode,
-            "width": image.size[0],
-            "height": image.size[1],
-        }
-        if has_labels:
-            annotation["ner_tags"] = (
-                inference_df.groupby("id")["ner_tag"].apply(list).loc[filename]
-            )
-        annotations.append(annotation)
-
-    return extraction_metadata, pd.DataFrame(annotations)
-
-
 def extract_filings(
     filings: pd.DataFrame,
     layoutlm: PyFuncModel,
@@ -182,54 +38,3 @@ def extract_filings(
         ).set_index("filename")
         extracted = Ex21CompanyOwnership.example(size=0)
     return metadata, extracted
-
-
-def _sort_by_label_priority(target_array):
-    id_priority = [label2id[label] for label in LABEL_PRIORITY]
-    # Create a priority map from the label priority
-    priority_map = {val: idx for idx, val in enumerate(id_priority)}
-    # Sort the target array based on the priority map
-    sorted_array = sorted(target_array, key=lambda x: priority_map.get(x, float("inf")))
-    return sorted_array
-
-
-def get_flattened_mode_predictions(token_boxes_tensor, predictions_tensor):
-    """Get the mode prediction for each box in an Ex. 21.
-
-    When handling multi page documents LayoutLM uses a sliding 'frame'
-    with some overlap between frames. The overlap creates multiple
-    predictions for the same bounding boxes. Thus it's necessary to find
-    the mode of all the predictions for a bounding box and use that as the
-    single prediction for each box. If there are multiple mode
-    predictions for a bounding box, then ties are broken by setting
-    a priority for the labels (LABEL_PRIORITY) and choosing the highest priority
-    label.
-    """
-    # Flatten the tensors
-    flat_token_boxes = token_boxes_tensor.view(-1, 4)
-    flat_predictions = predictions_tensor.view(-1)
-
-    boxes = flat_token_boxes.numpy()
-    predictions = flat_predictions.numpy()
-
-    # Find unique boxes and indices
-    unique_boxes, inverse_indices = np.unique(boxes, axis=0, return_inverse=True)
-
-    # Compute the mode for each unique bounding box
-    # for each unique box in boxes, create a list with all predictions for that box
-    # get the indices in predictions where the corresponding index in boxes is
-    unique_box_predictions = [
-        predictions[np.where(inverse_indices == i)[0]] for i in range(len(unique_boxes))
-    ]
-    pred_counts = [np.bincount(arr) for arr in unique_box_predictions]
-    # Compute the mode of predictions for each group
-    # break ties by taking into account LABEL_PRIORITY
-    modes = np.array(
-        [
-            _sort_by_label_priority(np.where(arr == np.max(arr))[0])[0]
-            for arr in pred_counts
-        ]
-    )
-    flattened_modes = modes[inverse_indices]
-
-    return flattened_modes
diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
index 4efc905..c03cc4e 100644
--- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
+++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
@@ -28,8 +28,7 @@
     "import dagstermill\n",
     "\n",
     "context = dagstermill.get_context(op_config={\n",
-    "    \"uri\": \"runs:/c363159de2f5439c93dd972d51247370/layoutlm_extractor\",\n",
-    "    \"training_set\": \"labeledv0.2\",\n",
+    "    \"uri\": None,\n",
     "})"
    ]
   },
@@ -50,17 +49,8 @@
     "tags": []
    },
    "source": [
-    "### Setup training/test sets"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8b389646-c4af-4c92-a29e-b4b23f4c391b",
-   "metadata": {},
-   "source": [
-    "Download training data and convert to NER annotations. This involves converting exhibit 21 filings into PDF's, then using labels generated by label studio to produce the annotations. These annotations are then used to create a huggingface dataset that will be used for training.\n",
-    "\n",
-    "First define several helper functions to do the conversion."
+    "### Define training metrics\n",
+    "The method `compute_metrics` will be used to score the model. It computes precision, recall, f1 score, and accuracy on bounding box labels output by `layoutlm`."
    ]
   },
   {
@@ -72,205 +62,15 @@
    },
    "outputs": [],
    "source": [
-    "import json\n",
     "import os\n",
-    "from pathlib import Path\n",
     "from tempfile import TemporaryDirectory\n",
     "\n",
     "import numpy as np\n",
     "import pandas as pd\n",
     "\n",
     "from mozilla_sec_eia.library import validation_helpers\n",
-    "from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive, get_metadata_filename\n",
-    "from mozilla_sec_eia.models.sec10k.utils.layoutlm import normalize_bboxes\n",
-    "from mozilla_sec_eia.models.sec10k.utils.pdf import (\n",
-    "    get_pdf_data_from_path,\n",
-    "    render_page,\n",
-    ")\n",
-    "\n",
-    "# Set some constants\n",
-    "LABELS = [\n",
-    "    \"O\",\n",
-    "    \"B-Subsidiary\",\n",
-    "    \"I-Subsidiary\",\n",
-    "    \"B-Loc\",\n",
-    "    \"I-Loc\",\n",
-    "    \"B-Own_Per\",\n",
-    "    \"I-Own_Per\",\n",
-    "]\n",
-    "LABEL_PRIORITY = [\n",
-    "    \"I-Subsidiary\",\n",
-    "    \"I-Loc\",\n",
-    "    \"I-Own_Per\",\n",
-    "    \"B-Subsidiary\",\n",
-    "    \"B-Loc\",\n",
-    "    \"B-Own_Per\",\n",
-    "    \"O\",\n",
-    "]\n",
-    "\n",
-    "BBOX_COLS = [\"top_left_x\", \"top_left_y\", \"bottom_right_x\", \"bottom_right_y\"]\n",
-    "BBOX_COLS_PDF = [\n",
-    "    \"top_left_x_pdf\",\n",
-    "    \"top_left_y_pdf\",\n",
-    "    \"bottom_right_x_pdf\",\n",
-    "    \"bottom_right_y_pdf\",\n",
-    "]\n",
-    "\n",
-    "# Map back and forth between id's and labels\n",
-    "id2label = dict(enumerate(LABELS))\n",
-    "label2id = {v: k for k, v in enumerate(LABELS)}\n",
-    "\n",
-    "def _is_cik_in_training_data(labeled_json_filename, tracking_df):\n",
-    "    # TODO: for now CIK is stored as an int, update when fixed\n",
-    "    cik = int(labeled_json_filename.split(\"/\")[-1].split(\"-\")[0])\n",
-    "    return cik in tracking_df.CIK.unique()\n",
-    "\n",
-    "\n",
-    "def format_label_studio_output(\n",
-    "    labeled_json_dir: Path,\n",
-    "    pdfs_dir: Path,\n",
-    ") -> pd.DataFrame:\n",
-    "    \"\"\"Format Label Studio output JSONs into dataframe.\"\"\"\n",
-    "    labeled_df = pd.DataFrame()\n",
-    "    # TODO: make this path stuff less janky?\n",
-    "    tracking_df = validation_helpers.load_training_data(\"ex21_labels.csv\")\n",
-    "    for json_filename in os.listdir(labeled_json_dir):\n",
-    "        if not json_filename[0].isdigit() or json_filename.endswith(\".json\"):\n",
-    "            continue\n",
-    "        json_file_path = labeled_json_dir / json_filename\n",
-    "        with Path.open(json_file_path) as j:\n",
-    "            doc_dict = json.loads(j.read())\n",
-    "\n",
-    "        filename = doc_dict[\"task\"][\"data\"][\"ocr\"].split(\"/\")[-1].split(\".\")[0]\n",
-    "        # check if old local naming schema is being used\n",
-    "        if len(filename.split(\"-\")) == 6:\n",
-    "            filename = \"-\".join(filename.split(\"-\")[2:])\n",
-    "        if not _is_cik_in_training_data(filename, tracking_df=tracking_df):\n",
-    "            continue\n",
-    "\n",
-    "        pdf_filename = filename + \".pdf\"\n",
-    "        src_path = pdfs_dir / pdf_filename\n",
-    "        extracted, pg = get_pdf_data_from_path(src_path)\n",
-    "        txt = extracted[\"pdf_text\"]\n",
-    "        pg_meta = extracted[\"page\"]\n",
-    "        # normalize bboxes between 0 and 1000 for Hugging Face\n",
-    "        txt = normalize_bboxes(txt_df=txt, pg_meta_df=pg_meta)\n",
-    "        # parse the output dictionary of labeled bounding boxes from Label Studio\n",
-    "        doc_df = pd.DataFrame()\n",
-    "        for item in doc_dict[\"result\"]:\n",
-    "            value = item[\"value\"]\n",
-    "            # sometimes Label Studio will fill in an empty list as a label\n",
-    "            # when there is really no label\n",
-    "            # TODO: do this without dict comprehension?\n",
-    "            if (\"labels\" in value) and value[\"labels\"] == []:\n",
-    "                value = {k: v for k, v in value.items() if k != \"labels\"}\n",
-    "            ind = int(item[\"id\"].split(\"_\")[-1])\n",
-    "            doc_df = pd.concat([doc_df, pd.DataFrame(value, index=[ind])])\n",
-    "\n",
-    "        # combine the bounding boxes for each word\n",
-    "        doc_df = doc_df.groupby(level=0).first()\n",
-    "        txt.loc[:, \"id\"] = filename\n",
-    "        # TODO: probably want to filter out these empty Ex. 21 docs\n",
-    "        # the doc might not have any labels in it if it was an empty Ex. 21\n",
-    "        if \"labels\" not in doc_df:\n",
-    "            doc_df.loc[:, \"labels\"] = pd.Series()\n",
-    "\n",
-    "        output_df = pd.concat([txt, doc_df[[\"labels\"]]], axis=1)\n",
-    "        labeled_df = pd.concat([labeled_df, output_df])\n",
-    "\n",
-    "    # fill in unlabeled words and clean up labeled dataframe\n",
-    "    labeled_df[\"labels\"] = labeled_df[\"labels\"].fillna(\"O\")\n",
-    "    labeled_df = labeled_df.rename(columns={\"labels\": \"ner_tag\"})\n",
-    "    non_id_columns = [col for col in labeled_df.columns if col != \"id\"]\n",
-    "    labeled_df = labeled_df.loc[:, [\"id\"] + non_id_columns]\n",
-    "\n",
-    "    # TODO: add in sanity checks on labeled_df bounding boxes to make sure\n",
-    "    # that no value is above 1000 or below 0\n",
-    "\n",
-    "    return labeled_df\n",
-    "\n",
-    "\n",
-    "def get_image_dict(pdfs_dir):\n",
-    "    \"\"\"Create a dictionary with filenames and their Ex. 21 images.\"\"\"\n",
-    "    image_dict = {}\n",
-    "    for pdf_filename in os.listdir(pdfs_dir):\n",
-    "        if pdf_filename.split(\".\")[-1] != \"pdf\":\n",
-    "            continue\n",
-    "        pdf_file_path = pdfs_dir / pdf_filename\n",
-    "        _, pg = get_pdf_data_from_path(pdf_file_path)\n",
-    "        full_pg_img = render_page(pg)\n",
-    "        filename = pdf_filename.split(\".\")[0]\n",
-    "        image_dict[filename] = full_pg_img\n",
-    "    return image_dict\n",
-    "\n",
-    "\n",
-    "def format_as_ner_annotations(\n",
-    "    labeled_json_path: Path,\n",
-    "    pdfs_path: Path,\n",
-    "    gcs_folder_name: Path,\n",
-    ") -> list[dict]:\n",
-    "    \"\"\"Format a Label Studio output JSONs as NER annotations.\n",
-    "\n",
-    "    Formats the dataframe as named entity recognition annotations.\n",
-    "    # TODO: say more about this format\n",
-    "\n",
-    "    Returns:\n",
-    "        ner_annotations: a list of dicts, with one dict for each doc.\n",
-    "    \"\"\"\n",
-    "    GCSArchive().cache_training_data(\n",
-    "        json_cache_path=labeled_json_path,\n",
-    "        pdf_cache_path=pdfs_path,\n",
-    "        gcs_folder_name=gcs_folder_name\n",
-    "    )\n",
-    "\n",
-    "    labeled_df = format_label_studio_output(\n",
-    "        labeled_json_dir=labeled_json_path, pdfs_dir=pdfs_path\n",
-    "    )\n",
-    "    # convert dataframe/dictionary into NER format\n",
-    "    # document_annotation_to_ner https://github.com/butlerlabs/docai/blob/main/docai/annotations/ner_utils.py\n",
-    "    # complete dataset is a list of dicts, with one dict for each doc\n",
-    "    doc_filenames = labeled_df[\"id\"].unique()\n",
-    "    image_dict = get_image_dict(pdfs_dir=pdfs_path)\n",
-    "    ner_annotations = []\n",
-    "    for filename in doc_filenames:\n",
-    "        annotation = {\n",
-    "            \"id\": filename,\n",
-    "            \"tokens\": labeled_df.groupby(\"id\")[\"text\"].apply(list).loc[filename],\n",
-    "            \"ner_tags\": labeled_df.groupby(\"id\")[\"ner_tag\"].apply(list).loc[filename],\n",
-    "            \"bboxes\": labeled_df.loc[labeled_df[\"id\"] == filename, :][BBOX_COLS_PDF]\n",
-    "            .to_numpy()\n",
-    "            .tolist(),\n",
-    "            \"image\": image_dict[filename],\n",
-    "        }\n",
-    "        ner_annotations.append(annotation)\n",
-    "\n",
-    "    return ner_annotations\n",
-    "\n",
-    "def _prepare_dataset(annotations, processor, label2id):\n",
-    "    \"\"\"Put the dataset in its final format for training LayoutLM.\"\"\"\n",
-    "\n",
-    "    def _convert_ner_tags_to_id(ner_tags, label2id):\n",
-    "        return [int(label2id[ner_tag]) for ner_tag in ner_tags]\n",
-    "\n",
-    "    images = annotations[\"image\"]\n",
-    "    words = annotations[\"tokens\"]\n",
-    "    boxes = annotations[\"bboxes\"]\n",
-    "    # Map over labels and convert to numeric id for each ner_tag\n",
-    "    ner_tags = [\n",
-    "        _convert_ner_tags_to_id(ner_tags, label2id)\n",
-    "        for ner_tags in annotations[\"ner_tags\"]\n",
-    "    ]\n",
+    "from mozilla_sec_eia.models.sec10k.utils.cloud import get_metadata_filename\n",
     "\n",
-    "    encoding = processor(\n",
-    "        images,\n",
-    "        words,\n",
-    "        boxes=boxes,\n",
-    "        word_labels=ner_tags,\n",
-    "        truncation=True,\n",
-    "        padding=\"max_length\",\n",
-    "    )\n",
-    "\n",
-    "    return encoding\n",
     "\n",
     "def compute_metrics(p, metric, label_list, return_entity_level_metrics=False):\n",
     "    \"\"\"Compute metrics to train and evaluate the model on.\"\"\"\n",
@@ -306,6 +106,43 @@
     "    }"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "39f0cbeb-7895-46bd-97d1-2c74e5265e12",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "#### Load training data asset\n",
+    "\n",
+    "The following cell loads the training data from a dagster asset. Using a dagster asset makes it easy to cache the training data, which can be computationally intensive to produce. When this notebook is run directly by dagster, this cell is replaced by dagster materializing the asset itself."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "f8df608a-32b7-4795-a670-63a2e8772910",
+   "metadata": {
+    "tags": [
+     "parameters"
+    ]
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n",
+      "2024-10-03 17:47:13 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_training_data using PickledObjectFilesystemIOManager...\n"
+     ]
+    }
+   ],
+   "source": [
+    "from mozilla_sec_eia.models.sec10k import defs\n",
+    "\n",
+    "ex21_training_data = defs.load_asset_value(\"ex21_training_data\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "8160263c-8f69-437c-918b-e56ad007961a",
@@ -326,12 +163,120 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 6,
    "id": "71d205b2-e6ea-4ad0-982c-22e762269119",
    "metadata": {
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
+      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "fafaf3dc8cfe431b90802b61bfe0acc6",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Map:   0%|          | 0/159 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_270331/790868001.py:94: FutureWarning: load_metric is deprecated and will be removed in the next major version of datasets. Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate\n",
+      "  metric = load_metric(\"seqeval\")\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/training_args.py:1525: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
+      "  warnings.warn(\n",
+      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+      "To disable this warning, you can either:\n",
+      "\t- Avoid using `tokenizers` before the fork if possible\n",
+      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
+      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+      "To disable this warning, you can either:\n",
+      "\t- Avoid using `tokenizers` before the fork if possible\n",
+      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
+      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+      "To disable this warning, you can either:\n",
+      "\t- Avoid using `tokenizers` before the fork if possible\n",
+      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
+      "max_steps is given, it will override any value given in num_train_epochs\n",
+      "2024/10/03 17:52:03 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='6' max='1000' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [   6/1000 00:02 < 10:34, 1.57 it/s, Epoch 0.04/8]\n",
+       "    </div>\n",
+       "    <table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       " <tr style=\"text-align: left;\">\n",
+       "      <th>Step</th>\n",
+       "      <th>Training Loss</th>\n",
+       "      <th>Validation Loss</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "  </tbody>\n",
+       "</table><p>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/10/03 17:52:09 INFO mlflow.tracking._tracking_service.client: 🏃 View run orderly-mare-33 at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/13/runs/a94ac72df36447a489d576ea06a71a4a.\n",
+      "2024/10/03 17:52:09 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/13.\n",
+      "2024/10/03 17:52:09 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...\n",
+      "2024/10/03 17:52:10 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!\n"
+     ]
+    },
+    {
+     "ename": "OutOfMemoryError",
+     "evalue": "CUDA out of memory. Tried to allocate 148.00 MiB. GPU 0 has a total capacity of 3.93 GiB of which 129.75 MiB is free. Including non-PyTorch memory, this process has 2.94 GiB memory in use. Of the allocated memory 1.89 GiB is allocated by PyTorch, and 979.98 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mOutOfMemoryError\u001b[0m                          Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[6], line 118\u001b[0m\n\u001b[1;32m    106\u001b[0m trainer \u001b[38;5;241m=\u001b[39m Trainer(\n\u001b[1;32m    107\u001b[0m     model\u001b[38;5;241m=\u001b[39mmodel,\n\u001b[1;32m    108\u001b[0m     args\u001b[38;5;241m=\u001b[39mtraining_args,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    113\u001b[0m     compute_metrics\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mlambda\u001b[39;00m p: compute_metrics(p, metric\u001b[38;5;241m=\u001b[39mmetric, label_list\u001b[38;5;241m=\u001b[39mLABELS),\n\u001b[1;32m    114\u001b[0m )\n\u001b[1;32m    116\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m mlflow\u001b[38;5;241m.\u001b[39mstart_run() \u001b[38;5;28;01mas\u001b[39;00m training_run:\n\u001b[1;32m    117\u001b[0m     \u001b[38;5;66;03m# Train inside mlflow run. Mlflow will automatically handle logging training metrcis\u001b[39;00m\n\u001b[0;32m--> 118\u001b[0m     \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    120\u001b[0m     \u001b[38;5;66;03m# Log finetuend model with mlflow\u001b[39;00m\n\u001b[1;32m    121\u001b[0m     model \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmodel\u001b[39m\u001b[38;5;124m\"\u001b[39m: trainer\u001b[38;5;241m.\u001b[39mmodel, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtokenizer\u001b[39m\u001b[38;5;124m\"\u001b[39m: trainer\u001b[38;5;241m.\u001b[39mtokenizer}\n",
+      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/trainer.py:1938\u001b[0m, in \u001b[0;36mTrainer.train\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m   1936\u001b[0m         hf_hub_utils\u001b[38;5;241m.\u001b[39menable_progress_bars()\n\u001b[1;32m   1937\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1938\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   1939\u001b[0m \u001b[43m        \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1940\u001b[0m \u001b[43m        \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1941\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1942\u001b[0m \u001b[43m        \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1943\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/trainer.py:2341\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m   2338\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m   2339\u001b[0m         grad_norm \u001b[38;5;241m=\u001b[39m _grad_norm\n\u001b[0;32m-> 2341\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptimizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstep\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2343\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallback_handler\u001b[38;5;241m.\u001b[39mon_optimizer_step(args, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol)\n\u001b[1;32m   2345\u001b[0m optimizer_was_run \u001b[38;5;241m=\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator\u001b[38;5;241m.\u001b[39moptimizer_step_was_skipped\n",
+      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/accelerate/optimizer.py:172\u001b[0m, in \u001b[0;36mAcceleratedOptimizer.step\u001b[0;34m(self, closure)\u001b[0m\n\u001b[1;32m    170\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_accelerate_step_called \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m    171\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 172\u001b[0m         \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptimizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstep\u001b[49m\u001b[43m(\u001b[49m\u001b[43mclosure\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    173\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator_state\u001b[38;5;241m.\u001b[39mdistributed_type \u001b[38;5;241m==\u001b[39m DistributedType\u001b[38;5;241m.\u001b[39mXLA:\n\u001b[1;32m    174\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgradient_state\u001b[38;5;241m.\u001b[39mis_xla_gradients_synced \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n",
+      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/optim/lr_scheduler.py:130\u001b[0m, in \u001b[0;36mLRScheduler.__init__.<locals>.patch_track_step_called.<locals>.wrap_step.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    128\u001b[0m opt \u001b[38;5;241m=\u001b[39m opt_ref()\n\u001b[1;32m    129\u001b[0m opt\u001b[38;5;241m.\u001b[39m_opt_called \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m  \u001b[38;5;66;03m# type: ignore[union-attr]\u001b[39;00m\n\u001b[0;32m--> 130\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__get__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mopt\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mopt\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;18;43m__class__\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/optim/optimizer.py:484\u001b[0m, in \u001b[0;36mOptimizer.profile_hook_step.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    479\u001b[0m         \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    480\u001b[0m             \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[1;32m    481\u001b[0m                 \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfunc\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must return None or a tuple of (new_args, new_kwargs), but got \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresult\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    482\u001b[0m             )\n\u001b[0;32m--> 484\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    485\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_optimizer_step_code()\n\u001b[1;32m    487\u001b[0m \u001b[38;5;66;03m# call optimizer step post hooks\u001b[39;00m\n",
+      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/optim/optimizer.py:89\u001b[0m, in \u001b[0;36m_use_grad_for_differentiable.<locals>._use_grad\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m     87\u001b[0m     torch\u001b[38;5;241m.\u001b[39mset_grad_enabled(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefaults[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdifferentiable\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m     88\u001b[0m     torch\u001b[38;5;241m.\u001b[39m_dynamo\u001b[38;5;241m.\u001b[39mgraph_break()\n\u001b[0;32m---> 89\u001b[0m     ret \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     90\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m     91\u001b[0m     torch\u001b[38;5;241m.\u001b[39m_dynamo\u001b[38;5;241m.\u001b[39mgraph_break()\n",
+      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/optim/adamw.py:227\u001b[0m, in \u001b[0;36mAdamW.step\u001b[0;34m(self, closure)\u001b[0m\n\u001b[1;32m    214\u001b[0m     beta1, beta2 \u001b[38;5;241m=\u001b[39m cast(Tuple[\u001b[38;5;28mfloat\u001b[39m, \u001b[38;5;28mfloat\u001b[39m], group[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbetas\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m    216\u001b[0m     has_complex \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_init_group(\n\u001b[1;32m    217\u001b[0m         group,\n\u001b[1;32m    218\u001b[0m         params_with_grad,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    224\u001b[0m         state_steps,\n\u001b[1;32m    225\u001b[0m     )\n\u001b[0;32m--> 227\u001b[0m     \u001b[43madamw\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    228\u001b[0m \u001b[43m        \u001b[49m\u001b[43mparams_with_grad\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    229\u001b[0m \u001b[43m        \u001b[49m\u001b[43mgrads\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    230\u001b[0m \u001b[43m        \u001b[49m\u001b[43mexp_avgs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    231\u001b[0m \u001b[43m        \u001b[49m\u001b[43mexp_avg_sqs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    232\u001b[0m \u001b[43m        \u001b[49m\u001b[43mmax_exp_avg_sqs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    233\u001b[0m \u001b[43m        \u001b[49m\u001b[43mstate_steps\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    234\u001b[0m \u001b[43m        \u001b[49m\u001b[43mamsgrad\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mamsgrad\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    235\u001b[0m \u001b[43m        \u001b[49m\u001b[43mbeta1\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbeta1\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    236\u001b[0m \u001b[43m        
\u001b[49m\u001b[43mbeta2\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbeta2\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    237\u001b[0m \u001b[43m        \u001b[49m\u001b[43mlr\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mlr\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    238\u001b[0m \u001b[43m        \u001b[49m\u001b[43mweight_decay\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mweight_decay\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    239\u001b[0m \u001b[43m        \u001b[49m\u001b[43meps\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43meps\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    240\u001b[0m \u001b[43m        \u001b[49m\u001b[43mmaximize\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmaximize\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    241\u001b[0m \u001b[43m        \u001b[49m\u001b[43mforeach\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mforeach\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    242\u001b[0m \u001b[43m        
\u001b[49m\u001b[43mcapturable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcapturable\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    243\u001b[0m \u001b[43m        \u001b[49m\u001b[43mdifferentiable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdifferentiable\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    244\u001b[0m \u001b[43m        \u001b[49m\u001b[43mfused\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfused\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    245\u001b[0m \u001b[43m        \u001b[49m\u001b[43mgrad_scale\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mgrad_scale\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    246\u001b[0m \u001b[43m        \u001b[49m\u001b[43mfound_inf\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfound_inf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    247\u001b[0m \u001b[43m        
\u001b[49m\u001b[43mhas_complex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhas_complex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    248\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    250\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m loss\n",
+      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/optim/optimizer.py:161\u001b[0m, in \u001b[0;36m_disable_dynamo_if_unsupported.<locals>.wrapper.<locals>.maybe_fallback\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    159\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m disabled_func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m    160\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 161\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/optim/adamw.py:767\u001b[0m, in \u001b[0;36madamw\u001b[0;34m(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, foreach, capturable, differentiable, fused, grad_scale, found_inf, has_complex, amsgrad, beta1, beta2, lr, weight_decay, eps, maximize)\u001b[0m\n\u001b[1;32m    764\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    765\u001b[0m     func \u001b[38;5;241m=\u001b[39m _single_tensor_adamw\n\u001b[0;32m--> 767\u001b[0m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    768\u001b[0m \u001b[43m    \u001b[49m\u001b[43mparams\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    769\u001b[0m \u001b[43m    \u001b[49m\u001b[43mgrads\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    770\u001b[0m \u001b[43m    \u001b[49m\u001b[43mexp_avgs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    771\u001b[0m \u001b[43m    \u001b[49m\u001b[43mexp_avg_sqs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    772\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmax_exp_avg_sqs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    773\u001b[0m \u001b[43m    \u001b[49m\u001b[43mstate_steps\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    774\u001b[0m \u001b[43m    \u001b[49m\u001b[43mamsgrad\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mamsgrad\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    775\u001b[0m \u001b[43m    \u001b[49m\u001b[43mbeta1\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbeta1\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    776\u001b[0m \u001b[43m    \u001b[49m\u001b[43mbeta2\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbeta2\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    777\u001b[0m \u001b[43m    \u001b[49m\u001b[43mlr\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlr\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    778\u001b[0m \u001b[43m    
\u001b[49m\u001b[43mweight_decay\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mweight_decay\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    779\u001b[0m \u001b[43m    \u001b[49m\u001b[43meps\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43meps\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    780\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmaximize\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmaximize\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    781\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcapturable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcapturable\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    782\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdifferentiable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdifferentiable\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    783\u001b[0m \u001b[43m    \u001b[49m\u001b[43mgrad_scale\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgrad_scale\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    784\u001b[0m \u001b[43m    \u001b[49m\u001b[43mfound_inf\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfound_inf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    785\u001b[0m \u001b[43m    \u001b[49m\u001b[43mhas_complex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhas_complex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    786\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/optim/adamw.py:600\u001b[0m, in \u001b[0;36m_multi_tensor_adamw\u001b[0;34m(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, grad_scale, found_inf, amsgrad, beta1, beta2, lr, weight_decay, eps, maximize, capturable, differentiable, has_complex)\u001b[0m\n\u001b[1;32m    598\u001b[0m     exp_avg_sq_sqrt \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39m_foreach_sqrt(device_max_exp_avg_sqs)\n\u001b[1;32m    599\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 600\u001b[0m     exp_avg_sq_sqrt \u001b[38;5;241m=\u001b[39m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_foreach_sqrt\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdevice_exp_avg_sqs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    602\u001b[0m torch\u001b[38;5;241m.\u001b[39m_foreach_div_(exp_avg_sq_sqrt, bias_correction2_sqrt)\n\u001b[1;32m    603\u001b[0m torch\u001b[38;5;241m.\u001b[39m_foreach_add_(exp_avg_sq_sqrt, eps)\n",
+      "\u001b[0;31mOutOfMemoryError\u001b[0m: CUDA out of memory. Tried to allocate 148.00 MiB. GPU 0 has a total capacity of 3.93 GiB of which 129.75 MiB is free. Including non-PyTorch memory, this process has 2.94 GiB memory in use. Of the allocated memory 1.89 GiB is allocated by PyTorch, and 979.98 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)"
+     ]
+    }
+   ],
    "source": [
     "import mlflow\n",
     "from datasets import (\n",
@@ -353,6 +298,11 @@
     "from transformers.data.data_collator import default_data_collator\n",
     "\n",
     "from mozilla_sec_eia.library.mlflow import configure_mlflow\n",
+    "from mozilla_sec_eia.models.sec10k.ex_21.data.common import (\n",
+    "    BBOX_COLS,\n",
+    "    LABELS,\n",
+    "    get_id_label_conversions,\n",
+    ")\n",
     "\n",
     "load_dotenv()\n",
     "\n",
@@ -360,19 +310,40 @@
     "configure_mlflow()\n",
     "mlflow.set_experiment(\"exhibit21_extraction_test\")\n",
     "\n",
+    "\n",
+    "def _prepare_dataset(annotations, processor, label2id):\n",
+    "    \"\"\"Put the dataset in its final format for training LayoutLM.\"\"\"\n",
+    "\n",
+    "    def _convert_ner_tags_to_id(ner_tags, label2id):\n",
+    "        return [int(label2id[ner_tag]) for ner_tag in ner_tags]\n",
+    "\n",
+    "    images = annotations[\"image\"]\n",
+    "    words = annotations[\"tokens\"]\n",
+    "    boxes = annotations[\"bboxes\"]\n",
+    "    # Map over labels and convert to numeric id for each ner_tag\n",
+    "    ner_tags = [\n",
+    "        _convert_ner_tags_to_id(ner_tags, label2id)\n",
+    "        for ner_tags in annotations[\"ner_tags\"]\n",
+    "    ]\n",
+    "\n",
+    "    encoding = processor(\n",
+    "        images,\n",
+    "        words,\n",
+    "        boxes=boxes,\n",
+    "        word_labels=ner_tags,\n",
+    "        truncation=True,\n",
+    "        padding=\"max_length\",\n",
+    "    )\n",
+    "\n",
+    "    return encoding\n",
+    "\n",
     "# Only finetune if configured to do so\n",
     "training_run_id = None\n",
     "if context.op_config[\"uri\"] is None:\n",
+    "    id2label, label2id = get_id_label_conversions(LABELS)\n",
     "    # Change temp_dir to save training data locally for inspection\n",
-    "    with TemporaryDirectory() as temp_dir:\n",
-    "        ner_annotations = format_as_ner_annotations(\n",
-    "            labeled_json_path=Path(temp_dir) / \"sec10k_filings\" / \"labeled_jsons\",\n",
-    "            pdfs_path=Path(temp_dir) / \"sec10k_filings\" / \"pdfs\",\n",
-    "            gcs_folder_name=context.op_config[\"training_set\"],\n",
-    "        )\n",
-    "\n",
     "    # Cache/prepare training data\n",
-    "    dataset = Dataset.from_list(ner_annotations)\n",
+    "    dataset = Dataset.from_list(ex21_training_data)\n",
     "\n",
     "    # Load pretrained model\n",
     "    model = LayoutLMv3ForTokenClassification.from_pretrained(\n",
@@ -449,7 +420,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "id": "42c8e920-d671-40c2-b5db-c43611a33897",
    "metadata": {
     "tags": []
@@ -621,71 +592,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "id": "4d802e00-1ca4-40b3-b15b-561711a9db70",
    "metadata": {
     "tags": []
    },
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "ff844a110fb04ddcbe788e647651786c",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2024/09/24 20:17:30 INFO mlflow.types.utils: Unsupported type hint: <class 'pandas.core.frame.DataFrame'>, skipping schema inference\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "004ac3503c77461f9ce7938949a660c5",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading artifacts:   0%|          | 0/17 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2024/09/24 20:17:52 WARNING mlflow.utils.requirements_utils: The following packages were not found in the public PyPI package index as of 2024-08-29; if these packages are not present in the public PyPI index, you must install them manually before loading your model: {'catalystcoop-mozilla-sec-eia'}\n",
-      "2024/09/24 20:17:52 WARNING mlflow.utils.requirements_utils: Found catalystcoop-mozilla-sec-eia version (0.1.dev304+g07d500a) contains a local version label (+g07d500a). MLflow logged a pip requirement for this package as 'catalystcoop-mozilla-sec-eia==0.1.dev304' without the local version label to make it installable from PyPI. To specify pip requirements containing local version labels, please use `conda_env` or `pip_requirements`.\n",
-      "2024/09/24 20:17:53 WARNING mlflow.transformers.model_io: Could not specify device parameter for this pipeline type.Falling back to loading the model with the default device.\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "07a654fa7c914b338b0e9fbc36d48bdd",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
    "source": [
     "from PIL import Image\n",
     "\n",
@@ -693,40 +605,9 @@
     "    Ex21CompanyOwnership,\n",
     "    Sec10kExtractionMetadata,\n",
     ")\n",
-    "\n",
-    "\n",
-    "def clean_extracted_df(extracted_df):\n",
-    "    \"\"\"Perform basic cleaning on a dataframe extracted from an Ex. 21.\"\"\"\n",
-    "    if extracted_df.empty:\n",
-    "        return extracted_df\n",
-    "    if \"row\" in extracted_df.columns:\n",
-    "        extracted_df = extracted_df.drop(columns=[\"row\"])\n",
-    "    extracted_df[\"subsidiary\"] = extracted_df[\"subsidiary\"].str.strip().str.lower()\n",
-    "    # strip special chars from the start and end of the string\n",
-    "    extracted_df[\"subsidiary\"] = extracted_df[\"subsidiary\"].str.replace(\n",
-    "        r\"^[^\\w&\\s]+|[^\\w&\\s]+$\", \"\", regex=True\n",
-    "    )\n",
-    "    if \"loc\" in extracted_df.columns:\n",
-    "        extracted_df[\"loc\"] = extracted_df[\"loc\"].str.strip().str.lower()\n",
-    "        extracted_df[\"loc\"] = extracted_df[\"loc\"].str.replace(\n",
-    "            r\"[^a-zA-Z&,\\s]\", \"\", regex=True\n",
-    "        )\n",
-    "    if \"own_per\" in extracted_df.columns:\n",
-    "        # remove special chars and letters\n",
-    "        extracted_df[\"own_per\"] = extracted_df[\"own_per\"].str.replace(\n",
-    "            r\"[^\\d.]\", \"\", regex=True\n",
-    "        )\n",
-    "        # Find values with multiple decimal points\n",
-    "        extracted_df[\"own_per\"] = extracted_df[\"own_per\"].str.replace(\n",
-    "            r\"(\\d*\\.\\d+)\\..*\", r\"\\1\", regex=True\n",
-    "        )\n",
-    "        extracted_df[\"own_per\"] = extracted_df[\"own_per\"].replace(\"\", np.nan)\n",
-    "        extracted_df[\"own_per\"] = extracted_df[\"own_per\"].astype(\n",
-    "            \"float64\", errors=\"ignore\"\n",
-    "        )\n",
-    "    # drop rows that have a null subsidiary value\n",
-    "    extracted_df = extracted_df.dropna(subset=\"subsidiary\")\n",
-    "    return extracted_df\n",
+    "from mozilla_sec_eia.models.sec10k.ex_21.ex21_validation_helpers import (\n",
+    "    clean_extracted_df,\n",
+    ")\n",
     "\n",
     "# If a model was trained in this notebook, use it. Otherwise, use\n",
     "if training_run_id is not None:\n",
@@ -797,47 +678,37 @@
   {
    "cell_type": "markdown",
    "id": "fee84b13-6c37-4afe-8faa-003ff149aa2d",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "source": [
     "### Model Evaluation\n",
     "Now the full extraction model can be evaluated using labeled validation data and logged to `mlflow`. The `mlflow` run used to evaluate and log the inference model will be created as a nested child run to the run used to train `layoutlm`. This setup allows multiple versions/configurations of inference to be associated with a single version of `layoutlm`, creating a clean organizational structure for testing the base model and inference logic separately."
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "0bd74bdc-bb63-4ad2-82ec-3dfcf93a6121",
+   "metadata": {},
+   "source": [
+    "#### Load validation data\n",
+    "Next, load an inference dataset containing validation data. This dataset is formatted exactly the same as those that will feed into the `Ex21Extractor` during a production run, but contains only data from the validation set. When creating inference datasets we also produce a metadata dataframe documenting any filings that couldn't be parsed/converted to a PDF. This dataframe should be empty for the validation set, but we will still load it for consistency with production runs."
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "id": "47c19b41-131f-4059-8f42-931237565a20",
    "metadata": {
-    "tags": []
+    "tags": [
+     "parameters"
+    ]
    },
    "outputs": [],
    "source": [
-    "def clean_ex21_validation_set(validation_df: pd.DataFrame):\n",
-    "    \"\"\"Clean Ex. 21 validation data to match extracted format.\"\"\"\n",
-    "    validation_df = validation_df.rename(\n",
-    "        columns={\n",
-    "            \"Filename\": \"id\",\n",
-    "            \"Subsidiary\": \"subsidiary\",\n",
-    "            \"Location of Incorporation\": \"loc\",\n",
-    "            \"Ownership Percentage\": \"own_per\",\n",
-    "        }\n",
-    "    )\n",
-    "    validation_df[\"own_per\"] = validation_df[\"own_per\"].astype(str)\n",
-    "    validation_df[\"filename\"] = validation_df[\"id\"].apply(get_metadata_filename)\n",
-    "    validation_df = clean_extracted_df(validation_df)\n",
-    "    return validation_df\n",
-    "\n",
-    "# Load labeled validation set\n",
-    "validation_set = clean_ex21_validation_set(\n",
-    "    validation_helpers.load_validation_data(\"ex21_labels.csv\")\n",
-    ")\n",
-    "\n",
-    "# Get filing metadata for filings in validation set\n",
-    "cloud_interface = GCSArchive()\n",
-    "filing_metadata = cloud_interface.get_metadata()\n",
-    "ex21_validation_filing_metadata = filing_metadata[\n",
-    "    filing_metadata.index.isin(validation_set[\"filename\"].unique())\n",
-    "]"
+    "ex21_failed_parsing_metadata = defs.load_asset_value(\"ex21_failed_parsing_metadata\")\n",
+    "ex21_inference_dataset = defs.load_asset_value(\"ex21_inference_dataset\")\n",
+    "ex21_validation_set = defs.load_asset_value(\"ex21_validation_set\")"
    ]
   },
   {
@@ -845,123 +716,20 @@
    "id": "eddcc912-324a-42e9-9841-3a916c6ece6b",
    "metadata": {},
    "source": [
-    "Next define methods evaluating model output, then run extraction and log in child run."
+    "Next define a method for computing validation metrics. The metrics computed above for training are looking at bounding boxes output by `layoutlm` and pertain to one word at a time. These metrics will look at an entire table produced by the inference pipeline and compare it to the validation data. "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "id": "f79bd14d-5156-4f34-9a50-e9c813b822cf",
    "metadata": {
     "tags": []
    },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2024/09/24 20:18:01 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.\n",
-      "Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.\n",
-      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n",
-      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n",
-      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n",
-      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n",
-      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n",
-      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n",
-      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n",
-      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n",
-      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n",
-      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n",
-      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n",
-      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n",
-      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n",
-      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n",
-      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n",
-      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n",
-      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n",
-      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n",
-      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n",
-      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n",
-      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n",
-      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n",
-      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n",
-      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n",
-      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n",
-      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n",
-      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n",
-      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n",
-      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n",
-      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n",
-      "/home/zach/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:51: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
-      "  padded_validation_set = pd.concat(\n",
-      "/home/zach/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:44: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
-      "  padded_compute_set = pd.concat(\n",
-      "/home/zach/miniforge3/envs/mozilla-sec-eia/lib/python3.11/site-packages/mlflow/types/utils.py:407: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.\n",
-      "  warnings.warn(\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "02516db30cd241ed97c08df920368bf8",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading artifacts:   0%|          | 0/17 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2024/09/24 20:19:33 WARNING mlflow.utils.requirements_utils: The following packages were not found in the public PyPI package index as of 2024-08-29; if these packages are not present in the public PyPI index, you must install them manually before loading your model: {'catalystcoop-mozilla-sec-eia'}\n",
-      "2024/09/24 20:19:33 WARNING mlflow.utils.requirements_utils: Found catalystcoop-mozilla-sec-eia version (0.1.dev304+g07d500a) contains a local version label (+g07d500a). MLflow logged a pip requirement for this package as 'catalystcoop-mozilla-sec-eia==0.1.dev304' without the local version label to make it installable from PyPI. To specify pip requirements containing local version labels, please use `conda_env` or `pip_requirements`.\n",
-      "2024/09/24 20:19:41 INFO mlflow.tracking._tracking_service.client: 🏃 View run fortunate-finch-744 at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/13/runs/b959cfa0ba3c4b91a0f8fe158cd0109f.\n",
-      "2024/09/24 20:19:41 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/13.\n",
-      "2024/09/24 20:19:41 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...\n",
-      "2024/09/24 20:19:42 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from mlflow.models import infer_signature\n",
     "\n",
-    "from mozilla_sec_eia.models.sec10k.ex_21.inference import create_inference_dataset\n",
-    "\n",
     "\n",
     "def ex21_validation_metrics(computed_df: pd.DataFrame, validation_df: pd.DataFrame):\n",
     "    \"\"\"Compute validation metrics for Ex. 21 extraction.\"\"\"\n",
@@ -1029,19 +797,30 @@
     "            \"avg_location_recall\": prec_recall_df[\"loc_recall\"].sum() / n_files,\n",
     "            \"avg_own_per_recall\": prec_recall_df[\"own_per_recall\"].sum() / n_files,\n",
     "        },\n",
-    "    )\n",
-    "\n",
-    "\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1dee550f-7b06-4091-a65e-71c6b23a5bea",
+   "metadata": {},
+   "source": [
+    "#### Validate model\n",
+    "Finally, run the full model on the validation set and log metrics to mlflow. The logged metrics/model will appear in a nested run below the training run used for the current version of the model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dfb56470-8527-424c-a9e5-4135e55fde4d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
     "with mlflow.start_run(parent_run_id=model_info.run_id, nested=True):\n",
-    "    failed_metadata, dataset = create_inference_dataset(\n",
-    "        filing_metadata=ex21_validation_filing_metadata,\n",
-    "        cloud_interface=cloud_interface,\n",
-    "        has_labels=False,\n",
-    "    )\n",
-    "    metadata, extracted = ex21_extraction_model.predict(dataset.copy())\n",
-    "    metadata = pd.concat([failed_metadata, metadata])\n",
+    "    metadata, extracted = ex21_extraction_model.predict(ex21_inference_dataset.copy())\n",
+    "    metadata = pd.concat([ex21_failed_parsing_metadata, metadata])\n",
     "\n",
-    "    jaccard_df, prec_recall_df, incorrect_filenames, metrics = ex21_validation_metrics(extracted, validation_set)\n",
+    "    jaccard_df, prec_recall_df, incorrect_filenames, metrics = ex21_validation_metrics(extracted, ex21_validation_set)\n",
     "    mlflow.log_metrics(metrics)\n",
     "    mlflow.pyfunc.log_model(\n",
     "        \"exhibit21_extractor\",\n",
@@ -1050,14 +829,6 @@
     "        signature=infer_signature(dataset, extracted), # NOTE: model returns a second dataframe with metadata, but mlflow only supports one in signature\n",
     "    )"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "dfb56470-8527-424c-a9e5-4135e55fde4d",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/train_exhibit21_extraction.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/train_exhibit21_extraction.ipynb
deleted file mode 100644
index 5c33d22..0000000
--- a/src/mozilla_sec_eia/models/sec10k/notebooks/train_exhibit21_extraction.ipynb
+++ /dev/null
@@ -1,1045 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "0da8c588-2d09-464b-945f-168704c0cdac",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "# Exhibit 21 extraction\n",
-    "\n",
-    "This notebook implements a model built on top of [layoutlmv3](https://huggingface.co/microsoft/layoutlmv3-base/tree/main)\n",
-    "from Exhibit 21 attachments to SEC-10k filings. These documents contain a list of all subsidiary companies owned by a filing\n",
-    "company."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "48f185de-95ef-4194-9245-93f8d603d2e6",
-   "metadata": {
-    "tags": [
-     "parameters"
-    ]
-   },
-   "outputs": [],
-   "source": [
-    "import dagstermill\n",
-    "\n",
-    "context = dagstermill.get_context(op_config={\n",
-    "    \"train_model\": True,\n",
-    "    \"model_version\": \"latest\",\n",
-    "})"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "7f299b2b-2358-4526-b023-f29c817316d9",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "## Train Layoutlmv3"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "32edcce1-ab18-40b6-9da8-ce0ea53c2f72",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "### Setup training/test sets"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8b389646-c4af-4c92-a29e-b4b23f4c391b",
-   "metadata": {},
-   "source": [
-    "Download training data and convert to NER annotations. This involves converting exhibit 21 filings into PDF's, then using labels generated by label studio to produce the annotations. These annotations are then used to create a huggingface dataset that will be used for training.\n",
-    "\n",
-    "First define several helper functions to do the conversion."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "9372b908-d9b9-4d18-a5bf-d332648b3e49",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "import json\n",
-    "import os\n",
-    "from pathlib import Path\n",
-    "from tempfile import TemporaryDirectory\n",
-    "\n",
-    "import numpy as np\n",
-    "import pandas as pd\n",
-    "\n",
-    "from mozilla_sec_eia.library import validation_helpers\n",
-    "from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive, get_metadata_filename\n",
-    "from mozilla_sec_eia.models.sec10k.utils.layoutlm import normalize_bboxes\n",
-    "from mozilla_sec_eia.models.sec10k.utils.pdf import (\n",
-    "    get_pdf_data_from_path,\n",
-    "    render_page,\n",
-    ")\n",
-    "\n",
-    "# Set some constants\n",
-    "LABELS = [\n",
-    "    \"O\",\n",
-    "    \"B-Subsidiary\",\n",
-    "    \"I-Subsidiary\",\n",
-    "    \"B-Loc\",\n",
-    "    \"I-Loc\",\n",
-    "    \"B-Own_Per\",\n",
-    "    \"I-Own_Per\",\n",
-    "]\n",
-    "LABEL_PRIORITY = [\n",
-    "    \"I-Subsidiary\",\n",
-    "    \"I-Loc\",\n",
-    "    \"I-Own_Per\",\n",
-    "    \"B-Subsidiary\",\n",
-    "    \"B-Loc\",\n",
-    "    \"B-Own_Per\",\n",
-    "    \"O\",\n",
-    "]\n",
-    "\n",
-    "BBOX_COLS = [\"top_left_x\", \"top_left_y\", \"bottom_right_x\", \"bottom_right_y\"]\n",
-    "BBOX_COLS_PDF = [\n",
-    "    \"top_left_x_pdf\",\n",
-    "    \"top_left_y_pdf\",\n",
-    "    \"bottom_right_x_pdf\",\n",
-    "    \"bottom_right_y_pdf\",\n",
-    "]\n",
-    "\n",
-    "# Map back and forth between id's and labels\n",
-    "id2label = dict(enumerate(LABELS))\n",
-    "label2id = {v: k for k, v in enumerate(LABELS)}\n",
-    "\n",
-    "def _is_cik_in_training_data(labeled_json_filename, tracking_df):\n",
-    "    # TODO: for now CIK is stored as an int, update when fixed\n",
-    "    cik = int(labeled_json_filename.split(\"/\")[-1].split(\"-\")[0])\n",
-    "    return cik in tracking_df.CIK.unique()\n",
-    "\n",
-    "\n",
-    "def format_label_studio_output(\n",
-    "    labeled_json_dir: Path,\n",
-    "    pdfs_dir: Path,\n",
-    ") -> pd.DataFrame:\n",
-    "    \"\"\"Format Label Studio output JSONs into dataframe.\"\"\"\n",
-    "    labeled_df = pd.DataFrame()\n",
-    "    # TODO: make this path stuff less janky?\n",
-    "    tracking_df = validation_helpers.load_training_data(\"ex21_labels.csv\")\n",
-    "    for json_filename in os.listdir(labeled_json_dir):\n",
-    "        if not json_filename[0].isdigit() or json_filename.endswith(\".json\"):\n",
-    "            continue\n",
-    "        json_file_path = labeled_json_dir / json_filename\n",
-    "        with Path.open(json_file_path) as j:\n",
-    "            doc_dict = json.loads(j.read())\n",
-    "\n",
-    "        filename = doc_dict[\"task\"][\"data\"][\"ocr\"].split(\"/\")[-1].split(\".\")[0]\n",
-    "        # check if old local naming schema is being used\n",
-    "        if len(filename.split(\"-\")) == 6:\n",
-    "            filename = \"-\".join(filename.split(\"-\")[2:])\n",
-    "        if not _is_cik_in_training_data(filename, tracking_df=tracking_df):\n",
-    "            continue\n",
-    "\n",
-    "        pdf_filename = filename + \".pdf\"\n",
-    "        src_path = pdfs_dir / pdf_filename\n",
-    "        extracted, pg = get_pdf_data_from_path(src_path)\n",
-    "        txt = extracted[\"pdf_text\"]\n",
-    "        pg_meta = extracted[\"page\"]\n",
-    "        # normalize bboxes between 0 and 1000 for Hugging Face\n",
-    "        txt = normalize_bboxes(txt_df=txt, pg_meta_df=pg_meta)\n",
-    "        # parse the output dictionary of labeled bounding boxes from Label Studio\n",
-    "        doc_df = pd.DataFrame()\n",
-    "        for item in doc_dict[\"result\"]:\n",
-    "            value = item[\"value\"]\n",
-    "            # sometimes Label Studio will fill in an empty list as a label\n",
-    "            # when there is really no label\n",
-    "            # TODO: do this without dict comprehension?\n",
-    "            if (\"labels\" in value) and value[\"labels\"] == []:\n",
-    "                value = {k: v for k, v in value.items() if k != \"labels\"}\n",
-    "            ind = int(item[\"id\"].split(\"_\")[-1])\n",
-    "            doc_df = pd.concat([doc_df, pd.DataFrame(value, index=[ind])])\n",
-    "\n",
-    "        # combine the bounding boxes for each word\n",
-    "        doc_df = doc_df.groupby(level=0).first()\n",
-    "        txt.loc[:, \"id\"] = filename\n",
-    "        # TODO: probably want to filter out these empty Ex. 21 docs\n",
-    "        # the doc might not have any labels in it if it was an empty Ex. 21\n",
-    "        if \"labels\" not in doc_df:\n",
-    "            doc_df.loc[:, \"labels\"] = pd.Series()\n",
-    "\n",
-    "        output_df = pd.concat([txt, doc_df[[\"labels\"]]], axis=1)\n",
-    "        labeled_df = pd.concat([labeled_df, output_df])\n",
-    "\n",
-    "    # fill in unlabeled words and clean up labeled dataframe\n",
-    "    labeled_df[\"labels\"] = labeled_df[\"labels\"].fillna(\"O\")\n",
-    "    labeled_df = labeled_df.rename(columns={\"labels\": \"ner_tag\"})\n",
-    "    non_id_columns = [col for col in labeled_df.columns if col != \"id\"]\n",
-    "    labeled_df = labeled_df.loc[:, [\"id\"] + non_id_columns]\n",
-    "\n",
-    "    # TODO: add in sanity checks on labeled_df bounding boxes to make sure\n",
-    "    # that no value is above 1000 or below 0\n",
-    "\n",
-    "    return labeled_df\n",
-    "\n",
-    "\n",
-    "def get_image_dict(pdfs_dir):\n",
-    "    \"\"\"Create a dictionary with filenames and their Ex. 21 images.\"\"\"\n",
-    "    image_dict = {}\n",
-    "    for pdf_filename in os.listdir(pdfs_dir):\n",
-    "        if pdf_filename.split(\".\")[-1] != \"pdf\":\n",
-    "            continue\n",
-    "        pdf_file_path = pdfs_dir / pdf_filename\n",
-    "        _, pg = get_pdf_data_from_path(pdf_file_path)\n",
-    "        full_pg_img = render_page(pg)\n",
-    "        filename = pdf_filename.split(\".\")[0]\n",
-    "        image_dict[filename] = full_pg_img\n",
-    "    return image_dict\n",
-    "\n",
-    "\n",
-    "def format_as_ner_annotations(\n",
-    "    labeled_json_path: Path,\n",
-    "    pdfs_path: Path,\n",
-    "    gcs_folder_name: Path,\n",
-    ") -> list[dict]:\n",
-    "    \"\"\"Format a Label Studio output JSONs as NER annotations.\n",
-    "\n",
-    "    Formats the dataframe as named entity recognition annotations.\n",
-    "    # TODO: say more about this format\n",
-    "\n",
-    "    Returns:\n",
-    "        ner_annotations: a list of dicts, with one dict for each doc.\n",
-    "    \"\"\"\n",
-    "    GCSArchive().cache_training_data(\n",
-    "        json_cache_path=labeled_json_path,\n",
-    "        pdf_cache_path=pdfs_path,\n",
-    "        gcs_folder_name=gcs_folder_name\n",
-    "    )\n",
-    "\n",
-    "    labeled_df = format_label_studio_output(\n",
-    "        labeled_json_dir=labeled_json_path, pdfs_dir=pdfs_path\n",
-    "    )\n",
-    "    # convert dataframe/dictionary into NER format\n",
-    "    # document_annotation_to_ner https://github.com/butlerlabs/docai/blob/main/docai/annotations/ner_utils.py\n",
-    "    # complete dataset is a list of dicts, with one dict for each doc\n",
-    "    doc_filenames = labeled_df[\"id\"].unique()\n",
-    "    image_dict = get_image_dict(pdfs_dir=pdfs_path)\n",
-    "    ner_annotations = []\n",
-    "    for filename in doc_filenames:\n",
-    "        annotation = {\n",
-    "            \"id\": filename,\n",
-    "            \"tokens\": labeled_df.groupby(\"id\")[\"text\"].apply(list).loc[filename],\n",
-    "            \"ner_tags\": labeled_df.groupby(\"id\")[\"ner_tag\"].apply(list).loc[filename],\n",
-    "            \"bboxes\": labeled_df.loc[labeled_df[\"id\"] == filename, :][BBOX_COLS_PDF]\n",
-    "            .to_numpy()\n",
-    "            .tolist(),\n",
-    "            \"image\": image_dict[filename],\n",
-    "        }\n",
-    "        ner_annotations.append(annotation)\n",
-    "\n",
-    "    return ner_annotations\n",
-    "\n",
-    "def _prepare_dataset(annotations, processor, label2id):\n",
-    "    \"\"\"Put the dataset in its final format for training LayoutLM.\"\"\"\n",
-    "\n",
-    "    def _convert_ner_tags_to_id(ner_tags, label2id):\n",
-    "        return [int(label2id[ner_tag]) for ner_tag in ner_tags]\n",
-    "\n",
-    "    images = annotations[\"image\"]\n",
-    "    words = annotations[\"tokens\"]\n",
-    "    boxes = annotations[\"bboxes\"]\n",
-    "    # Map over labels and convert to numeric id for each ner_tag\n",
-    "    ner_tags = [\n",
-    "        _convert_ner_tags_to_id(ner_tags, label2id)\n",
-    "        for ner_tags in annotations[\"ner_tags\"]\n",
-    "    ]\n",
-    "\n",
-    "    encoding = processor(\n",
-    "        images,\n",
-    "        words,\n",
-    "        boxes=boxes,\n",
-    "        word_labels=ner_tags,\n",
-    "        truncation=True,\n",
-    "        padding=\"max_length\",\n",
-    "    )\n",
-    "\n",
-    "    return encoding\n",
-    "\n",
-    "def compute_metrics(p, metric, label_list, return_entity_level_metrics=False):\n",
-    "    \"\"\"Compute metrics to train and evaluate the model on.\"\"\"\n",
-    "    predictions, labels = p\n",
-    "    predictions = np.argmax(predictions, axis=2)\n",
-    "\n",
-    "    # Remove ignored index (special tokens)\n",
-    "    true_predictions = [\n",
-    "        [label_list[pred] for (pred, lab) in zip(prediction, label) if lab != -100]\n",
-    "        for prediction, label in zip(predictions, labels)\n",
-    "    ]\n",
-    "    true_labels = [\n",
-    "        [label_list[lab] for (pred, lab) in zip(prediction, label) if lab != -100]\n",
-    "        for prediction, label in zip(predictions, labels)\n",
-    "    ]\n",
-    "\n",
-    "    results = metric.compute(predictions=true_predictions, references=true_labels)\n",
-    "    if return_entity_level_metrics:\n",
-    "        # Unpack nested dictionaries\n",
-    "        final_results = {}\n",
-    "        for key, value in results.items():\n",
-    "            if isinstance(value, dict):\n",
-    "                for n, v in value.items():\n",
-    "                    final_results[f\"{key}_{n}\"] = v\n",
-    "            else:\n",
-    "                final_results[key] = value\n",
-    "        return final_results\n",
-    "    return {\n",
-    "        \"precision\": results[\"overall_precision\"],\n",
-    "        \"recall\": results[\"overall_recall\"],\n",
-    "        \"f1\": results[\"overall_f1\"],\n",
-    "        \"accuracy\": results[\"overall_accuracy\"],\n",
-    "    }"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8160263c-8f69-437c-918b-e56ad007961a",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "#### Finetune Model\n",
-    "The next cell will use the functions defined in the previous section to actually construct a huggingface dataset from labeled data and finetune the `layoutlm` model. Model finetuning will only be run if configured to do so, otherwise a pretrained version will be used from the `mlflow` tracking server.\n",
-    "\n",
-    "Model training contains several steps implemented below:\n",
-    "1. Use temporary path to convert filings to PDF's and stash labels\n",
-    "2. Use PDF's and labels to convert PDF's and labels to NER annotations\n",
-    "3. Construct huggingface dataset from NER annotations and split into train and test sets\n",
-    "4. Load pretrained model from huggingface\n",
-    "5. Finetune model on training data and evaluate on test data"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "71d205b2-e6ea-4ad0-982c-22e762269119",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "<table> is empty\n",
-      "'<c> The Southwest Companies Nevada PriMerit Bank Federally chartered stock savings bank Paiute Pipeline Company Nevada Carson Water Company Nevada Southwest Gas Transmission Company Partnership between Southwest Gas Corporation and Utility Financial Corp. Utility Financial Corp. Nevada Southwest Gas Corporation of Arizona Nevada PRIMERIT BANK SUBSIDIARIES AT DECEMBER 31, 1993'\n",
-      "<table> is empty\n",
-      "'<c> TCA Management Company.................................................... Texas Teleservice Corporation of America........................................ Texas Texas Community Antennas, Inc............................................. Texas Texas Telecable, Inc...................................................... Texas TCA Cable of Amarillo, Inc................................................ Texas Telecable Associates, Inc................................................. Texas Delta Cablevision, Inc.................................................... Arkansas Sun Valley Cablevision, Inc............................................... Idaho VPI Communications, Inc................................................... Texas AvComm Corporation........................................................ Texas Tele-Communications of Arkansas L. P......................................'\n",
-      "<table> is empty\n",
-      "'<c> DOMESTIC SUBSIDIARIES International Sales &amp; Business, Inc. California KLA-Tencor Building Corporation California KLA-Tencor Disc Corporation California KLA-Tencor International Corporation California KLA-Tencor Klinnik Corporation California KLA-Tencor Management Corporation California KLA-Tencor (Thailand Branch) Corporation California VLSI Standards, Inc. California Amray, Inc. Delaware Groff Associates, Inc. California DeviceWare, Inc. California INTERNATIONAL SUBSIDIARIES'\n",
-      "<table> is empty\n",
-      "'<c> 1. Northeast Energy, LLC (100%-Owned) .................................................... Florida 2. Northeast Energy Associates, A Limited Partnership (99%-Owned) (a) .................... Massachusetts 3. North Jersey Energy Associates, A Limited Partnership (99%-Owned) (a) ................. New Jersey (a) Northeast Energy, LLC owns the remaining 1% interest. </c>'\n",
-      "<table> is empty\n",
-      "'<c> 1. ESI Tractebel Urban Renewal Corporation (100%-Owned) .................................. New Jersey </c>'\n",
-      "<table> is empty\n",
-      "'<c> IVANHOE ENERGY HOLDINGS INC. (Nevada) 100% IVANHOE ENERGY (USA) INC. (Nevada) 100% (indirect) IVANHOE ENERGY ROYALTY INC. (Nevada) 100% (indirect) IVANHOE ENERGY INTERNATIONAL VENTURES INC. (BVI) 100% Ivanhoe Energy Sweetwater Limited (Malta) 100% (Indirect) Ivanhoe Energy (Qatar) Inc. (BVI) 100% (Indirect) GTL Japan Corporation (Japan) 100% (Indirect) IVANHOE ENERGY'\n",
-      "<table> is empty\n",
-      "'<c> Airgas Canada, Inc. Canada Airgas Carbonic, Inc. DE Airgas Data, LLC DE Airgas East, Inc. DE Airgas Great Lakes, Inc. DE Airgas Gulf States, Inc. DE Airgas Intermountain, Inc. CO Airgas International, Inc. VI Airgas Mid America, Inc. DE Airgas Mid South, Inc. DE Airgas Nor Pac, Inc. DE'\n",
-      "<table> is empty\n",
-      "'<c> Subsidiary Name State of Formation - --------------- ------------------- American Ecology Environmental Services Corporation Texas Corporation American Ecology Holdings Corporation Delaware Corporation American Ecology Recycle Center, Inc. Delaware Corporation American Ecology Services Corporation Delaware Corporation Texas Ecologists, Inc. Texas Corporation US Ecology, Inc. California Corporation US Ecology Idaho, Inc. Delaware'\n",
-      "Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
-      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "bae617cb831d4b2593c0fa4a874f1592",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Map:   0%|          | 0/159 [00:00<?, ? examples/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/training_args.py:1525: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
-      "  warnings.warn(\n",
-      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
-      "To disable this warning, you can either:\n",
-      "\t- Avoid using `tokenizers` before the fork if possible\n",
-      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
-      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
-      "To disable this warning, you can either:\n",
-      "\t- Avoid using `tokenizers` before the fork if possible\n",
-      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
-      "max_steps is given, it will override any value given in num_train_epochs\n",
-      "2024/09/23 14:14:48 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.\n",
-      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "\n",
-       "    <div>\n",
-       "      \n",
-       "      <progress value='2' max='1000' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
-       "      [   2/1000 : < :, Epoch 0.01/8]\n",
-       "    </div>\n",
-       "    <table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       " <tr style=\"text-align: left;\">\n",
-       "      <th>Step</th>\n",
-       "      <th>Training Loss</th>\n",
-       "      <th>Validation Loss</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "  </tbody>\n",
-       "</table><p>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2024/09/23 14:14:50 INFO mlflow.tracking._tracking_service.client: 🏃 View run bedecked-trout-555 at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/13/runs/573e64992704411c9013937d849e1504.\n",
-      "2024/09/23 14:14:50 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/13.\n",
-      "2024/09/23 14:14:51 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...\n",
-      "2024/09/23 14:14:51 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!\n"
-     ]
-    },
-    {
-     "ename": "OutOfMemoryError",
-     "evalue": "CUDA out of memory. Tried to allocate 24.00 MiB. GPU 0 has a total capacity of 3.93 GiB of which 20.12 MiB is free. Including non-PyTorch memory, this process has 2.72 GiB memory in use. Of the allocated memory 2.53 GiB is allocated by PyTorch, and 104.91 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mOutOfMemoryError\u001b[0m                          Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[6], line 94\u001b[0m\n\u001b[1;32m     91\u001b[0m mlflow\u001b[38;5;241m.\u001b[39mset_experiment(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mexhibit21_extraction_test\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m     92\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m mlflow\u001b[38;5;241m.\u001b[39mstart_run():\n\u001b[1;32m     93\u001b[0m     \u001b[38;5;66;03m# Train inside mlflow run. Mlflow will automatically handle logging training metrcis\u001b[39;00m\n\u001b[0;32m---> 94\u001b[0m     \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     96\u001b[0m     \u001b[38;5;66;03m# Log finetuend model with mlflow\u001b[39;00m\n\u001b[1;32m     97\u001b[0m     mlflow\u001b[38;5;241m.\u001b[39mtransformers\u001b[38;5;241m.\u001b[39mlog_model(\n\u001b[1;32m     98\u001b[0m         trainer, artifact_path\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlayoutlm_extractor\u001b[39m\u001b[38;5;124m\"\u001b[39m, task\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtoken-classification\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m     99\u001b[0m     )\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/trainer.py:1938\u001b[0m, in \u001b[0;36mTrainer.train\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m   1936\u001b[0m         hf_hub_utils\u001b[38;5;241m.\u001b[39menable_progress_bars()\n\u001b[1;32m   1937\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1938\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   1939\u001b[0m \u001b[43m        \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1940\u001b[0m \u001b[43m        \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1941\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1942\u001b[0m \u001b[43m        \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1943\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/trainer.py:2279\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m   2276\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallback_handler\u001b[38;5;241m.\u001b[39mon_step_begin(args, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol)\n\u001b[1;32m   2278\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator\u001b[38;5;241m.\u001b[39maccumulate(model):\n\u001b[0;32m-> 2279\u001b[0m     tr_loss_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtraining_step\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2281\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m   2282\u001b[0m     args\u001b[38;5;241m.\u001b[39mlogging_nan_inf_filter\n\u001b[1;32m   2283\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torch_xla_available()\n\u001b[1;32m   2284\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m (torch\u001b[38;5;241m.\u001b[39misnan(tr_loss_step) \u001b[38;5;129;01mor\u001b[39;00m torch\u001b[38;5;241m.\u001b[39misinf(tr_loss_step))\n\u001b[1;32m   2285\u001b[0m ):\n\u001b[1;32m   2286\u001b[0m     \u001b[38;5;66;03m# if loss is nan or inf simply add the average of previous logged losses\u001b[39;00m\n\u001b[1;32m   2287\u001b[0m     tr_loss \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m tr_loss \u001b[38;5;241m/\u001b[39m (\u001b[38;5;241m1\u001b[39m \u001b[38;5;241m+\u001b[39m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mglobal_step \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_globalstep_last_logged)\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/trainer.py:3318\u001b[0m, in \u001b[0;36mTrainer.training_step\u001b[0;34m(self, model, inputs)\u001b[0m\n\u001b[1;32m   3315\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m loss_mb\u001b[38;5;241m.\u001b[39mreduce_mean()\u001b[38;5;241m.\u001b[39mdetach()\u001b[38;5;241m.\u001b[39mto(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mdevice)\n\u001b[1;32m   3317\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompute_loss_context_manager():\n\u001b[0;32m-> 3318\u001b[0m     loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute_loss\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   3320\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m inputs\n\u001b[1;32m   3321\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m   3322\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mtorch_empty_cache_steps \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m   3323\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mglobal_step \u001b[38;5;241m%\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mtorch_empty_cache_steps \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m   3324\u001b[0m ):\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/trainer.py:3363\u001b[0m, in \u001b[0;36mTrainer.compute_loss\u001b[0;34m(self, model, inputs, return_outputs)\u001b[0m\n\u001b[1;32m   3361\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m   3362\u001b[0m     labels \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 3363\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   3364\u001b[0m \u001b[38;5;66;03m# Save past state if it exists\u001b[39;00m\n\u001b[1;32m   3365\u001b[0m \u001b[38;5;66;03m# TODO: this needs to be fixed and made cleaner later.\u001b[39;00m\n\u001b[1;32m   3366\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mpast_index \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1551\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1553\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1560\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1561\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1562\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m   1565\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.py:1099\u001b[0m, in \u001b[0;36mLayoutLMv3ForTokenClassification.forward\u001b[0;34m(self, input_ids, bbox, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict, pixel_values)\u001b[0m\n\u001b[1;32m   1069\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m   1070\u001b[0m \u001b[38;5;124;03mlabels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):\u001b[39;00m\n\u001b[1;32m   1071\u001b[0m \u001b[38;5;124;03m    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m   1095\u001b[0m \u001b[38;5;124;03m>>> logits = outputs.logits\u001b[39;00m\n\u001b[1;32m   1096\u001b[0m \u001b[38;5;124;03m```\"\"\"\u001b[39;00m\n\u001b[1;32m   1097\u001b[0m return_dict \u001b[38;5;241m=\u001b[39m return_dict \u001b[38;5;28;01mif\u001b[39;00m return_dict \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39muse_return_dict\n\u001b[0;32m-> 1099\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlayoutlmv3\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   1100\u001b[0m \u001b[43m    \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1101\u001b[0m \u001b[43m    \u001b[49m\u001b[43mbbox\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbbox\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1102\u001b[0m \u001b[43m    
\u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1103\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtoken_type_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken_type_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1104\u001b[0m \u001b[43m    \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1105\u001b[0m \u001b[43m    \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1106\u001b[0m \u001b[43m    \u001b[49m\u001b[43minputs_embeds\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs_embeds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1107\u001b[0m \u001b[43m    \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1108\u001b[0m \u001b[43m    \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1109\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1110\u001b[0m \u001b[43m    \u001b[49m\u001b[43mpixel_values\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpixel_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1111\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1112\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m input_ids \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m   1113\u001b[0m     input_shape \u001b[38;5;241m=\u001b[39m input_ids\u001b[38;5;241m.\u001b[39msize()\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1551\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1553\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1560\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1561\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1562\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m   1565\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.py:975\u001b[0m, in \u001b[0;36mLayoutLMv3Model.forward\u001b[0;34m(self, input_ids, bbox, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, pixel_values, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m    968\u001b[0m \u001b[38;5;66;03m# Prepare head mask if needed\u001b[39;00m\n\u001b[1;32m    969\u001b[0m \u001b[38;5;66;03m# 1.0 in head_mask indicate we keep the head\u001b[39;00m\n\u001b[1;32m    970\u001b[0m \u001b[38;5;66;03m# attention_probs has shape bsz x n_heads x N x N\u001b[39;00m\n\u001b[1;32m    971\u001b[0m \u001b[38;5;66;03m# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]\u001b[39;00m\n\u001b[1;32m    972\u001b[0m \u001b[38;5;66;03m# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]\u001b[39;00m\n\u001b[1;32m    973\u001b[0m head_mask \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_head_mask(head_mask, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mnum_hidden_layers)\n\u001b[0;32m--> 975\u001b[0m encoder_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoder\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    976\u001b[0m \u001b[43m    \u001b[49m\u001b[43membedding_output\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    977\u001b[0m \u001b[43m    \u001b[49m\u001b[43mbbox\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfinal_bbox\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    978\u001b[0m \u001b[43m    \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfinal_position_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    979\u001b[0m \u001b[43m    
\u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextended_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    980\u001b[0m \u001b[43m    \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    981\u001b[0m \u001b[43m    \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    982\u001b[0m \u001b[43m    \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    983\u001b[0m \u001b[43m    \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    984\u001b[0m \u001b[43m    \u001b[49m\u001b[43mpatch_height\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpatch_height\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    985\u001b[0m \u001b[43m    \u001b[49m\u001b[43mpatch_width\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpatch_width\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    986\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    988\u001b[0m sequence_output \u001b[38;5;241m=\u001b[39m encoder_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m    990\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m return_dict:\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1551\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1553\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1560\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1561\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1562\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m   1565\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.py:681\u001b[0m, in \u001b[0;36mLayoutLMv3Encoder.forward\u001b[0;34m(self, hidden_states, bbox, attention_mask, head_mask, output_attentions, output_hidden_states, return_dict, position_ids, patch_height, patch_width)\u001b[0m\n\u001b[1;32m    671\u001b[0m     layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_gradient_checkpointing_func(\n\u001b[1;32m    672\u001b[0m         layer_module\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__call__\u001b[39m,\n\u001b[1;32m    673\u001b[0m         hidden_states,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    678\u001b[0m         rel_2d_pos,\n\u001b[1;32m    679\u001b[0m     )\n\u001b[1;32m    680\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 681\u001b[0m     layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[43mlayer_module\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    682\u001b[0m \u001b[43m        \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    683\u001b[0m \u001b[43m        \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    684\u001b[0m \u001b[43m        \u001b[49m\u001b[43mlayer_head_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    685\u001b[0m \u001b[43m        \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    686\u001b[0m \u001b[43m        \u001b[49m\u001b[43mrel_pos\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrel_pos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    687\u001b[0m \u001b[43m        \u001b[49m\u001b[43mrel_2d_pos\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrel_2d_pos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    688\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    690\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m 
layer_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m    691\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m output_attentions:\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1551\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1553\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1560\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1561\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1562\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m   1565\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.py:532\u001b[0m, in \u001b[0;36mLayoutLMv3Layer.forward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, output_attentions, rel_pos, rel_2d_pos)\u001b[0m\n\u001b[1;32m    523\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\n\u001b[1;32m    524\u001b[0m     \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m    525\u001b[0m     hidden_states,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    530\u001b[0m     rel_2d_pos\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m    531\u001b[0m ):\n\u001b[0;32m--> 532\u001b[0m     self_attention_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mattention\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    533\u001b[0m \u001b[43m        \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    534\u001b[0m \u001b[43m        \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    535\u001b[0m \u001b[43m        \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    536\u001b[0m \u001b[43m        \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    537\u001b[0m \u001b[43m        \u001b[49m\u001b[43mrel_pos\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrel_pos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    538\u001b[0m \u001b[43m        \u001b[49m\u001b[43mrel_2d_pos\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrel_2d_pos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    539\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    540\u001b[0m     attention_output \u001b[38;5;241m=\u001b[39m self_attention_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m    542\u001b[0m     
outputs \u001b[38;5;241m=\u001b[39m self_attention_outputs[\u001b[38;5;241m1\u001b[39m:]  \u001b[38;5;66;03m# add self attentions if we output attention weights\u001b[39;00m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1551\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1553\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1560\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1561\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1562\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m   1565\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.py:500\u001b[0m, in \u001b[0;36mLayoutLMv3Attention.forward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, output_attentions, rel_pos, rel_2d_pos)\u001b[0m\n\u001b[1;32m    491\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\n\u001b[1;32m    492\u001b[0m     \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m    493\u001b[0m     hidden_states,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    498\u001b[0m     rel_2d_pos\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m    499\u001b[0m ):\n\u001b[0;32m--> 500\u001b[0m     self_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mself\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    501\u001b[0m \u001b[43m        \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    502\u001b[0m \u001b[43m        \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    503\u001b[0m \u001b[43m        \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    504\u001b[0m \u001b[43m        \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    505\u001b[0m \u001b[43m        \u001b[49m\u001b[43mrel_pos\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrel_pos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    506\u001b[0m \u001b[43m        \u001b[49m\u001b[43mrel_2d_pos\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrel_2d_pos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    507\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    508\u001b[0m     attention_output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput(self_outputs[\u001b[38;5;241m0\u001b[39m], hidden_states)\n\u001b[1;32m    509\u001b[0m     outputs 
\u001b[38;5;241m=\u001b[39m (attention_output,) \u001b[38;5;241m+\u001b[39m self_outputs[\u001b[38;5;241m1\u001b[39m:]  \u001b[38;5;66;03m# add attentions if we output them\u001b[39;00m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1551\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)  \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m   1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1553\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1560\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1561\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1562\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m   1565\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.py:448\u001b[0m, in \u001b[0;36mLayoutLMv3SelfAttention.forward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, output_attentions, rel_pos, rel_2d_pos)\u001b[0m\n\u001b[1;32m    444\u001b[0m     attention_scores \u001b[38;5;241m=\u001b[39m attention_scores \u001b[38;5;241m+\u001b[39m attention_mask\n\u001b[1;32m    446\u001b[0m \u001b[38;5;66;03m# Normalize the attention scores to probabilities.\u001b[39;00m\n\u001b[1;32m    447\u001b[0m \u001b[38;5;66;03m# Use the trick of the CogView paper to stablize training\u001b[39;00m\n\u001b[0;32m--> 448\u001b[0m attention_probs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcogview_attention\u001b[49m\u001b[43m(\u001b[49m\u001b[43mattention_scores\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    450\u001b[0m \u001b[38;5;66;03m# This is actually dropping out entire tokens to attend to, which might\u001b[39;00m\n\u001b[1;32m    451\u001b[0m \u001b[38;5;66;03m# seem a bit unusual, but is taken from the original Transformer paper.\u001b[39;00m\n\u001b[1;32m    452\u001b[0m attention_probs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdropout(attention_probs)\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.py:414\u001b[0m, in \u001b[0;36mLayoutLMv3SelfAttention.cogview_attention\u001b[0;34m(self, attention_scores, alpha)\u001b[0m\n\u001b[1;32m    412\u001b[0m scaled_attention_scores \u001b[38;5;241m=\u001b[39m attention_scores \u001b[38;5;241m/\u001b[39m alpha\n\u001b[1;32m    413\u001b[0m max_value \u001b[38;5;241m=\u001b[39m scaled_attention_scores\u001b[38;5;241m.\u001b[39mamax(dim\u001b[38;5;241m=\u001b[39m(\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m))\u001b[38;5;241m.\u001b[39munsqueeze(\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m)\n\u001b[0;32m--> 414\u001b[0m new_attention_scores \u001b[38;5;241m=\u001b[39m \u001b[43m(\u001b[49m\u001b[43mscaled_attention_scores\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mmax_value\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43m \u001b[49m\u001b[43malpha\u001b[49m\n\u001b[1;32m    415\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m nn\u001b[38;5;241m.\u001b[39mSoftmax(dim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m)(new_attention_scores)\n",
-      "\u001b[0;31mOutOfMemoryError\u001b[0m: CUDA out of memory. Tried to allocate 24.00 MiB. GPU 0 has a total capacity of 3.93 GiB of which 20.12 MiB is free. Including non-PyTorch memory, this process has 2.72 GiB memory in use. Of the allocated memory 2.53 GiB is allocated by PyTorch, and 104.91 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)"
-     ]
-    }
-   ],
-   "source": [
-    "import mlflow\n",
-    "from datasets import (\n",
-    "    Array2D,\n",
-    "    Array3D,\n",
-    "    Dataset,\n",
-    "    Features,\n",
-    "    Sequence,\n",
-    "    Value,\n",
-    "    load_metric,\n",
-    ")\n",
-    "from dotenv import load_dotenv\n",
-    "from transformers import (\n",
-    "    AutoProcessor,\n",
-    "    LayoutLMv3ForTokenClassification,\n",
-    "    Trainer,\n",
-    "    TrainingArguments,\n",
-    ")\n",
-    "from transformers.data.data_collator import default_data_collator\n",
-    "\n",
-    "from mozilla_sec_eia.library.mlflow import configure_mlflow\n",
-    "\n",
-    "load_dotenv()\n",
-    "\n",
-    "\n",
-    "configure_mlflow()\n",
-    "\n",
-    "# Only finetune if configured to do so\n",
-    "if context.op_config[\"train_model\"]:\n",
-    "    # Change temp_dir to save training data locally for inspection\n",
-    "    with TemporaryDirectory() as temp_dir:\n",
-    "        ner_annotations = format_as_ner_annotations(\n",
-    "            labeled_json_path=Path(temp_dir) / \"sec10k_filings\" / \"labeled_jsons\",\n",
-    "            pdfs_path=Path(temp_dir) / \"sec10k_filings\" / \"pdfs\",\n",
-    "            gcs_folder_name=\"labeledv0.2/\",\n",
-    "        )\n",
-    "\n",
-    "    # Cache/prepare training data\n",
-    "    dataset = Dataset.from_list(ner_annotations)\n",
-    "\n",
-    "    # Load pretrained model\n",
-    "    model = LayoutLMv3ForTokenClassification.from_pretrained(\n",
-    "        \"microsoft/layoutlmv3-base\", id2label=id2label, label2id=label2id\n",
-    "    )\n",
-    "    processor = AutoProcessor.from_pretrained(\n",
-    "        \"microsoft/layoutlmv3-base\", apply_ocr=False\n",
-    "    )\n",
-    "\n",
-    "    # Prepare our train & eval dataset\n",
-    "    column_names = dataset.column_names\n",
-    "    features = Features(\n",
-    "        {\n",
-    "            \"pixel_values\": Array3D(dtype=\"float32\", shape=(3, 224, 224)),\n",
-    "            \"input_ids\": Sequence(feature=Value(dtype=\"int64\")),\n",
-    "            \"attention_mask\": Sequence(Value(dtype=\"int64\")),\n",
-    "            \"bbox\": Array2D(dtype=\"int64\", shape=(512, 4)),\n",
-    "            \"labels\": Sequence(feature=Value(dtype=\"int64\")),\n",
-    "        }\n",
-    "    )\n",
-    "    dataset = dataset.map(\n",
-    "        lambda annotations: _prepare_dataset(annotations, processor, label2id),\n",
-    "        batched=True,\n",
-    "        remove_columns=column_names,\n",
-    "        features=features,\n",
-    "    )\n",
-    "    dataset.set_format(\"torch\")\n",
-    "    split_dataset = dataset.train_test_split(test_size=0.2)\n",
-    "    train_dataset, eval_dataset = split_dataset[\"train\"], split_dataset[\"test\"]\n",
-    "\n",
-    "    # Initialize our Trainer\n",
-    "    metric = load_metric(\"seqeval\")\n",
-    "    training_args = TrainingArguments(\n",
-    "        max_steps=1000,\n",
-    "        per_device_train_batch_size=1,\n",
-    "        per_device_eval_batch_size=1,\n",
-    "        learning_rate=1e-5,\n",
-    "        evaluation_strategy=\"steps\",\n",
-    "        eval_steps=100,\n",
-    "        load_best_model_at_end=True,\n",
-    "        metric_for_best_model=\"f1\",\n",
-    "        output_dir=\"./layoutlm\",\n",
-    "    )\n",
-    "    trainer = Trainer(\n",
-    "        model=model,\n",
-    "        args=training_args,\n",
-    "        train_dataset=train_dataset,\n",
-    "        eval_dataset=eval_dataset,\n",
-    "        tokenizer=processor,\n",
-    "        data_collator=default_data_collator,\n",
-    "        compute_metrics=lambda p: compute_metrics(p, metric=metric, label_list=LABELS),\n",
-    "    )\n",
-    "\n",
-    "    mlflow.set_experiment(\"exhibit21_extraction_test\")\n",
-    "    with mlflow.start_run():\n",
-    "        # Train inside mlflow run. Mlflow will automatically handle logging training metrcis\n",
-    "        trainer.train()\n",
-    "\n",
-    "        # Log finetuend model with mlflow\n",
-    "        mlflow.transformers.log_model(\n",
-    "            trainer, artifact_path=\"layoutlm_extractor\", task=\"token-classification\"\n",
-    "        )"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "ee9b4e20-7781-43a7-b7aa-caf0690a201e",
-   "metadata": {},
-   "source": [
-    "## Model inference\n",
-    "Use the finetuned model to perform inference and evaluate on labeled validation data. First create a Huggingface `Pipeline` which wraps layoutlm with some custom pre/post processing steps. "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "42c8e920-d671-40c2-b5db-c43611a33897",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "import torch\n",
-    "from transformers import Pipeline, pipeline\n",
-    "from transformers.tokenization_utils_base import BatchEncoding\n",
-    "\n",
-    "from mozilla_sec_eia.models.sec10k.utils.layoutlm import (\n",
-    "    iob_to_label,\n",
-    ")\n",
-    "\n",
-    "\n",
-    "def _sort_by_label_priority(target_array):\n",
-    "    id_priority = [label2id[label] for label in LABEL_PRIORITY]\n",
-    "    # Create a priority map from the label priority\n",
-    "    priority_map = {val: idx for idx, val in enumerate(id_priority)}\n",
-    "    # Sort the target array based on the priority map\n",
-    "    sorted_array = sorted(target_array, key=lambda x: priority_map.get(x, float(\"inf\")))\n",
-    "    return sorted_array\n",
-    "\n",
-    "def get_flattened_mode_predictions(token_boxes_tensor, predictions_tensor):\n",
-    "    \"\"\"Get the mode prediction for each box in an Ex. 21.\n",
-    "\n",
-    "    When handling multi page documents LayoutLM uses a sliding 'frame'\n",
-    "    with some overlap between frames. The overlap creates multiple\n",
-    "    predictions for the same bounding boxes. Thus it's necessary to find\n",
-    "    the mode of all the predictions for a bounding box and use that as the\n",
-    "    single prediction for each box. If there are multiple mode\n",
-    "    predictions for a bounding box, then ties are broken by setting\n",
-    "    a priority for the labels (LABEL_PRIORITY) and choosing the highest priority\n",
-    "    label.\n",
-    "    \"\"\"\n",
-    "    # Flatten the tensors\n",
-    "    flat_token_boxes = token_boxes_tensor.view(-1, 4)\n",
-    "    flat_predictions = predictions_tensor.view(-1)\n",
-    "\n",
-    "    boxes = flat_token_boxes.numpy()\n",
-    "    predictions = flat_predictions.numpy()\n",
-    "\n",
-    "    # Find unique boxes and indices\n",
-    "    unique_boxes, inverse_indices = np.unique(boxes, axis=0, return_inverse=True)\n",
-    "\n",
-    "    # Compute the mode for each unique bounding box\n",
-    "    # for each unique box in boxes, create a list with all predictions for that box\n",
-    "    # get the indices in predictions where the corresponding index in boxes is\n",
-    "    unique_box_predictions = [\n",
-    "        predictions[np.where(inverse_indices == i)[0]] for i in range(len(unique_boxes))\n",
-    "    ]\n",
-    "    pred_counts = [np.bincount(arr) for arr in unique_box_predictions]\n",
-    "    # Compute the mode of predictions for each group\n",
-    "    # break ties by taking into account LABEL_PRIORITY\n",
-    "    modes = np.array(\n",
-    "        [\n",
-    "            _sort_by_label_priority(np.where(arr == np.max(arr))[0])[0]\n",
-    "            for arr in pred_counts\n",
-    "        ]\n",
-    "    )\n",
-    "    flattened_modes = modes[inverse_indices]\n",
-    "\n",
-    "    return flattened_modes\n",
-    "\n",
-    "class LayoutLMInferencePipeline(Pipeline):\n",
-    "    \"\"\"Pipeline for performing inference with fine-tuned LayoutLM.\"\"\"\n",
-    "\n",
-    "    def __init__(self, *args, **kwargs):\n",
-    "        \"\"\"Initialize LayoutLMInferencePipeline.\"\"\"\n",
-    "        super().__init__(*args, **kwargs)\n",
-    "\n",
-    "    def _sanitize_parameters(self, **kwargs):\n",
-    "        preprocess_kwargs = {}\n",
-    "        if \"maybe_arg\" in kwargs:\n",
-    "            preprocess_kwargs[\"maybe_arg\"] = kwargs[\"maybe_arg\"]\n",
-    "        return preprocess_kwargs, {}, {}\n",
-    "\n",
-    "    def preprocess(self, doc_dict):\n",
-    "        \"\"\"Encode and tokenize model inputs.\"\"\"\n",
-    "        image = doc_dict[\"image\"]\n",
-    "        words = doc_dict[\"tokens\"]\n",
-    "        boxes = doc_dict[\"bboxes\"]\n",
-    "        encoding = self.tokenizer(\n",
-    "            image,\n",
-    "            words,\n",
-    "            boxes=boxes,\n",
-    "            return_tensors=\"pt\",\n",
-    "            truncation=True,\n",
-    "            padding=\"max_length\",\n",
-    "            max_length=512,  # this is the maximum max_length\n",
-    "            stride=128,\n",
-    "            return_offsets_mapping=True,\n",
-    "            return_overflowing_tokens=True,\n",
-    "        )\n",
-    "        model_inputs = {}\n",
-    "        model_inputs[\"raw_encoding\"] = encoding.copy()\n",
-    "        model_inputs[\"doc_dict\"] = doc_dict\n",
-    "        model_inputs[\"offset_mapping\"] = encoding.pop(\"offset_mapping\")\n",
-    "        model_inputs[\"sample_mapping\"] = encoding.pop(\"overflow_to_sample_mapping\")\n",
-    "        # TODO: do we actually need to make these into ints?\n",
-    "        encoding[\"input_ids\"] = encoding[\"input_ids\"].to(torch.int64)\n",
-    "        encoding[\"attention_mask\"] = encoding[\"attention_mask\"].to(torch.int64)\n",
-    "        encoding[\"bbox\"] = encoding[\"bbox\"].to(torch.int64)\n",
-    "        encoding[\"pixel_values\"] = torch.stack(encoding[\"pixel_values\"])\n",
-    "        model_inputs[\"encoding\"] = encoding\n",
-    "        return model_inputs\n",
-    "\n",
-    "    def _forward(self, model_inputs):\n",
-    "        # encoding is passed as a UserDict in the model_inputs dictionary\n",
-    "        # turn it back into a BatchEncoding\n",
-    "        encoding = BatchEncoding(model_inputs[\"encoding\"])\n",
-    "        if torch.cuda.is_available():\n",
-    "            encoding.to(\"cuda\")\n",
-    "            self.model.to(\"cuda\")\n",
-    "        # since we're doing inference, we don't need gradient computation\n",
-    "        with torch.no_grad():\n",
-    "            output = self.model(**encoding)\n",
-    "            return {\n",
-    "                \"logits\": output.logits,\n",
-    "                \"predictions\": output.logits.argmax(-1).squeeze().tolist(),\n",
-    "                \"raw_encoding\": model_inputs[\"raw_encoding\"],\n",
-    "                \"doc_dict\": model_inputs[\"doc_dict\"],\n",
-    "            }\n",
-    "\n",
-    "    def postprocess(self, all_outputs):\n",
-    "        \"\"\"Return logits, model predictions, and the extracted dataframe.\"\"\"\n",
-    "        logits = all_outputs[\"logits\"]\n",
-    "        predictions = all_outputs[\"logits\"].argmax(-1).squeeze().tolist()\n",
-    "        output_df = self.extract_table(all_outputs)\n",
-    "        return logits, predictions, output_df\n",
-    "\n",
-    "    def extract_table(self, all_outputs):\n",
-    "        \"\"\"Extract a structured table from a set of inference predictions.\n",
-    "\n",
-    "        This function essentially works by stacking bounding boxes and predictions\n",
-    "        into a dataframe and going from left to right and top to bottom. Then, every\n",
-    "        every time a new subsidiary entity is encountered, it assigns a new group or\n",
-    "        \"row\" to that subsidiary. Next, location and ownership percentage words/labeled\n",
-    "        entities in between these subsidiary groups are assigned to a subsidiary row/group.\n",
-    "        Finally, this is all formatted into a dataframe with an ID column from the original\n",
-    "        filename and a basic cleaning function normalizes strings.\n",
-    "        \"\"\"\n",
-    "        # TODO: when model more mature, break this into sub functions to make it\n",
-    "        # clearer what's going on\n",
-    "        predictions = all_outputs[\"predictions\"]\n",
-    "        encoding = all_outputs[\"raw_encoding\"]\n",
-    "        doc_dict = all_outputs[\"doc_dict\"]\n",
-    "\n",
-    "        token_boxes_tensor = encoding[\"bbox\"].flatten(start_dim=0, end_dim=1)\n",
-    "        predictions_tensor = torch.tensor(predictions)\n",
-    "        mode_predictions = get_flattened_mode_predictions(\n",
-    "            token_boxes_tensor, predictions_tensor\n",
-    "        )\n",
-    "        token_boxes = encoding[\"bbox\"].flatten(start_dim=0, end_dim=1).tolist()\n",
-    "        predicted_labels = [\n",
-    "            self.model.config.id2label[pred] for pred in mode_predictions\n",
-    "        ]\n",
-    "        simple_preds = [iob_to_label(pred).lower() for pred in predicted_labels]\n",
-    "\n",
-    "        df = pd.DataFrame(data=token_boxes, columns=BBOX_COLS)\n",
-    "        df.loc[:, \"iob_pred\"] = predicted_labels\n",
-    "        df.loc[:, \"pred\"] = simple_preds\n",
-    "        invalid_mask = (\n",
-    "            (df[\"top_left_x\"] == 0)\n",
-    "            & (df[\"top_left_y\"] == 0)\n",
-    "            & (df[\"bottom_right_x\"] == 0)\n",
-    "            & (df[\"bottom_right_y\"] == 0)\n",
-    "        )\n",
-    "        df = df[~invalid_mask]\n",
-    "        # we want to get actual words on the dataframe, not just subwords that correspond to tokens\n",
-    "        # subwords from the same word share the same bounding box coordinates\n",
-    "        # so we merge the original words onto our dataframe on bbox coordinates\n",
-    "        words_df = pd.DataFrame(data=doc_dict[\"bboxes\"], columns=BBOX_COLS)\n",
-    "        words_df.loc[:, \"word\"] = doc_dict[\"tokens\"]\n",
-    "        df = df.merge(words_df, how=\"left\", on=BBOX_COLS).drop_duplicates(\n",
-    "            subset=BBOX_COLS + [\"pred\", \"word\"]\n",
-    "        )\n",
-    "        # rows that are the first occurrence in a new group (subsidiary, loc, own_per)\n",
-    "        # should always have a B entity label. Manually override labels so this is true.\n",
-    "        first_in_group_df = df[\n",
-    "            (df[\"pred\"].ne(df[\"pred\"].shift())) & (df[\"pred\"] != \"other\")\n",
-    "        ]\n",
-    "        first_in_group_df.loc[:, \"iob_pred\"] = (\n",
-    "            \"B\" + first_in_group_df[\"iob_pred\"].str[1:]\n",
-    "        )\n",
-    "        df.update(first_in_group_df)\n",
-    "        # filter for just words that were labeled with non \"other\" entities\n",
-    "        entities_df = df.sort_values(by=[\"top_left_y\", \"top_left_x\"])\n",
-    "        entities_df = entities_df[entities_df[\"pred\"] != \"other\"]\n",
-    "        # words are labeled with IOB format which stands for inside, outside, beginning\n",
-    "        # merge B and I entities to form one entity group\n",
-    "        # (i.e. \"B-Subsidiary\" and \"I-Subsidiary\" become just \"subsidiary\"), assign a group ID\n",
-    "        entities_df[\"group\"] = (entities_df[\"iob_pred\"].str.startswith(\"B-\")).cumsum()\n",
-    "        grouped_df = (\n",
-    "            entities_df.groupby([\"group\", \"pred\"])[\"word\"]\n",
-    "            .apply(\" \".join)\n",
-    "            .reset_index()[[\"pred\", \"word\"]]\n",
-    "        )\n",
-    "        # assign a new row every time there's a new subsidiary\n",
-    "        grouped_df[\"row\"] = (grouped_df[\"pred\"].str.startswith(\"subsidiary\")).cumsum()\n",
-    "        output_df = grouped_df.pivot_table(\n",
-    "            index=\"row\", columns=\"pred\", values=\"word\", aggfunc=lambda x: \" \".join(x)\n",
-    "        ).reset_index()\n",
-    "        if output_df.empty:\n",
-    "            return output_df\n",
-    "        output_df.loc[:, \"id\"] = doc_dict[\"id\"]\n",
-    "        return output_df"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "ea9fe887-43ca-43e2-85e3-bf5371bd165f",
-   "metadata": {},
-   "source": [
-    "Next, wrap the `LayoutLMInferencePipeline` in an `mlflow` `pyfunc` model, which handles loading the pretrained model and managing inputs/outputs."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "4d802e00-1ca4-40b3-b15b-561711a9db70",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "from mozilla_sec_eia.models.sec10k.entities import (\n",
-    "    Ex21CompanyOwnership,\n",
-    "    Sec10kExtractionMetadata,\n",
-    ")\n",
-    "from mozilla_sec_eia.models.sec10k.ex_21.inference import clean_extracted_df\n",
-    "\n",
-    "# Construct model_uri from model_version\n",
-    "model_uri = f\"models:/layoutlm_extractor/{context.op_config['model_version']}\"\n",
-    "model_info = mlflow.models.get_model_info(model_uri)\n",
-    "\n",
-    "def _get_data(dataset):\n",
-    "    yield from dataset\n",
-    "\n",
-    "class Ex21Extractor(mlflow.pyfunc.PythonModel):\n",
-    "    \"\"\"Create an mlflow pyfunc model to perform full EX21 extraction.\"\"\"\n",
-    "    def load_context(self, context):\n",
-    "        \"\"\"Load pretrained model.\"\"\"\n",
-    "        os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"expandable_segments:True\"\n",
-    "        self.model_components = mlflow.transformers.load_model(\n",
-    "            context.artifacts[\"layoutlm_extractor\"], return_type=\"components\"\n",
-    "        )\n",
-    "\n",
-    "    def predict(self, context, model_input: Dataset, params=None):\n",
-    "        \"\"\"Use pretrained model and inference pipeline to perform inference.\"\"\"\n",
-    "        # TODO: figure out device argument\n",
-    "        pipe = pipeline(\n",
-    "            \"token-classification\",\n",
-    "            model=self.model_components[\"model\"],\n",
-    "            tokenizer=self.model_components[\"tokenizer\"],\n",
-    "            pipeline_class=LayoutLMInferencePipeline,\n",
-    "        )\n",
-    "\n",
-    "        logits = []\n",
-    "        predictions = []\n",
-    "        all_output_df = Ex21CompanyOwnership.example(size=0)\n",
-    "        extraction_metadata = Sec10kExtractionMetadata.example(size=0)\n",
-    "        for logit, pred, output_df in pipe(_get_data(model_input)):\n",
-    "            logits.append(logit)\n",
-    "            predictions.append(pred)\n",
-    "            if not output_df.empty:\n",
-    "                filename = get_metadata_filename(output_df[\"id\"].iloc[0])\n",
-    "                extraction_metadata.loc[filename, [\"success\"]] = True\n",
-    "            all_output_df = pd.concat([all_output_df, output_df])\n",
-    "        all_output_df.columns.name = None\n",
-    "        all_output_df = clean_extracted_df(all_output_df)\n",
-    "        all_output_df = all_output_df[[\"id\", \"subsidiary\", \"loc\", \"own_per\"]]\n",
-    "        all_output_df = all_output_df.reset_index(drop=True)\n",
-    "        return extraction_metadata, all_output_df\n",
-    "\n",
-    "# Save model to local temp dir with artifacts, then reload for evaluation\n",
-    "with TemporaryDirectory() as tmp_dir:\n",
-    "    mlflow.pyfunc.save_model(\n",
-    "        path=tmp_dir,\n",
-    "        python_model=Ex21Extractor(),\n",
-    "        artifacts={\"model_components\": model_uri},\n",
-    "    )\n",
-    "    ex21_extraction_model = mlflow.pyfunc.load_model(tmp_dir)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "fee84b13-6c37-4afe-8faa-003ff149aa2d",
-   "metadata": {},
-   "source": [
-    "### Model Evaluation\n",
-    "Now the full extraction model can be evaluated using labeled validation data and logged to `mlflow`. The `mlflow` run used to evaluate and log the inference model will be created as a nested child run to the run used to train `layoutlm`. This setup allows multiple versions/configurations of inference to be associated with a single version of `layoutlm`, creating a clean organizational structure for testing the base model and inference logic separately."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "47c19b41-131f-4059-8f42-931237565a20",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "def clean_ex21_validation_set(validation_df: pd.DataFrame):\n",
-    "    \"\"\"Clean Ex. 21 validation data to match extracted format.\"\"\"\n",
-    "    validation_df = validation_df.rename(\n",
-    "        columns={\n",
-    "            \"Filename\": \"id\",\n",
-    "            \"Subsidiary\": \"subsidiary\",\n",
-    "            \"Location of Incorporation\": \"loc\",\n",
-    "            \"Ownership Percentage\": \"own_per\",\n",
-    "        }\n",
-    "    )\n",
-    "    validation_df[\"own_per\"] = validation_df[\"own_per\"].astype(str)\n",
-    "    validation_df[\"filename\"] = validation_df[\"id\"].apply(get_metadata_filename)\n",
-    "    validation_df = clean_extracted_df(validation_df)\n",
-    "    return validation_df\n",
-    "\n",
-    "# Load labeled validation set\n",
-    "validation_set = clean_ex21_validation_set(\n",
-    "    validation_helpers.load_validation_data(\"ex21_labels.csv\")\n",
-    ")\n",
-    "\n",
-    "# Get filing metadata for filings in validation set\n",
-    "cloud_interface = GCSArchive()\n",
-    "filing_metadata = cloud_interface.get_metadata()\n",
-    "ex21_validation_filing_metadata = filing_metadata[\n",
-    "    filing_metadata.index.isin(validation_set[\"filename\"].unique())\n",
-    "]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "eddcc912-324a-42e9-9841-3a916c6ece6b",
-   "metadata": {},
-   "source": [
-    "Next define methods evaluating model output, then run extraction and log in child run."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f79bd14d-5156-4f34-9a50-e9c813b822cf",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "from mozilla_sec_eia.models.sec10k.ex_21.inference import create_inference_dataset\n",
-    "\n",
-    "\n",
-    "def ex21_validation_metrics(computed_df: pd.DataFrame, validation_df: pd.DataFrame):\n",
-    "    \"\"\"Compute validation metrics for Ex. 21 extraction.\"\"\"\n",
-    "    shared_cols = validation_df.columns.intersection(computed_df.columns)\n",
-    "    validation_df = validation_df.astype(computed_df[shared_cols].dtypes)\n",
-    "    n_equal = 0\n",
-    "    validation_filenames = validation_df[\"id\"].unique()\n",
-    "    n_files = len(validation_filenames)\n",
-    "    table_metrics_dict = {}\n",
-    "    jaccard_dict = {}\n",
-    "    incorrect_files = []\n",
-    "    # iterate through each file and check each extracted table\n",
-    "    for filename in validation_filenames:\n",
-    "        extracted_table_df = computed_df[computed_df[\"id\"] == filename].reset_index(\n",
-    "            drop=True\n",
-    "        )\n",
-    "        validation_table_df = validation_df[\n",
-    "            validation_df[\"id\"] == filename\n",
-    "        ].reset_index(drop=True)\n",
-    "        # check if the tables are exactly equal\n",
-    "        if extracted_table_df.equals(validation_table_df):\n",
-    "            # TODO: strip llc and other company strings before comparison\n",
-    "            n_equal += 1\n",
-    "        else:\n",
-    "            incorrect_files.append(filename)\n",
-    "        # compute precision and recall for each column\n",
-    "        table_metrics_dict[filename] = {}\n",
-    "        jaccard_dict[filename] = {}\n",
-    "        for col in [\"subsidiary\", \"loc\", \"own_per\"]:\n",
-    "            table_prec_recall = validation_helpers.pandas_compute_precision_recall(\n",
-    "                extracted_table_df, validation_table_df, value_col=col\n",
-    "            )\n",
-    "            table_metrics_dict[filename][f\"{col}_precision\"] = table_prec_recall[\n",
-    "                \"precision\"\n",
-    "            ]\n",
-    "            table_metrics_dict[filename][f\"{col}_recall\"] = table_prec_recall[\"recall\"]\n",
-    "            # get the jaccard similarity between columns\n",
-    "            jaccard_dict[filename][col] = validation_helpers.jaccard_similarity(\n",
-    "                computed_df=extracted_table_df,\n",
-    "                validation_df=validation_table_df,\n",
-    "                value_col=col,\n",
-    "            )\n",
-    "\n",
-    "    jaccard_df = pd.DataFrame.from_dict(jaccard_dict, orient=\"index\").reset_index()\n",
-    "    prec_recall_df = pd.DataFrame.from_dict(\n",
-    "        table_metrics_dict, orient=\"index\"\n",
-    "    ).reset_index()\n",
-    "\n",
-    "    return (\n",
-    "        jaccard_df,\n",
-    "        prec_recall_df,\n",
-    "        pd.DataFrame({\"filename\": incorrect_files}),\n",
-    "        {\n",
-    "            \"table_accuracy\": n_equal / n_files,\n",
-    "            \"avg_subsidiary_jaccard_sim\": jaccard_df[\"subsidiary\"].sum() / n_files,\n",
-    "            \"avg_location_jaccard_sim\": jaccard_df[\"loc\"].sum() / n_files,\n",
-    "            \"avg_own_per_jaccard_sim\": jaccard_df[\"own_per\"].sum() / n_files,\n",
-    "            \"avg_subsidiary_precision\": prec_recall_df[\"subsidiary_precision\"].sum()\n",
-    "            / n_files,\n",
-    "            \"avg_location_precision\": prec_recall_df[\"loc_precision\"].sum() / n_files,\n",
-    "            \"avg_own_per_precision\": prec_recall_df[\"own_per_precision\"].sum()\n",
-    "            / n_files,\n",
-    "            \"avg_subsidiary_recall\": prec_recall_df[\"subsidiary_recall\"].sum()\n",
-    "            / n_files,\n",
-    "            \"avg_location_recall\": prec_recall_df[\"loc_recall\"].sum() / n_files,\n",
-    "            \"avg_own_per_recall\": prec_recall_df[\"own_per_recall\"].sum() / n_files,\n",
-    "        },\n",
-    "    )\n",
-    "\n",
-    "\n",
-    "with mlflow.start_run(parent_run_id=model_info.run_id, nested=True):\n",
-    "    failed_metadata, dataset = create_inference_dataset(\n",
-    "        filings=ex21_validation_filing_metadata,\n",
-    "        cloud_interface=cloud_interface,\n",
-    "        has_labels=True,\n",
-    "    )\n",
-    "    metadata, extracted = ex21_extraction_model.predict(dataset)\n",
-    "    metadata = pd.concat([failed_metadata, metadata])\n",
-    "\n",
-    "    jaccard_df, prec_recall_df, incorrect_filenames, metrics = ex21_validation_metrics(extracted, validation_set)\n",
-    "    mlflow.log_metrics(metrics)\n",
-    "    mlflow.pyfunc.log_model(\"exhibit21_extractor\", python_model=ex21_extraction_model)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "45a5b13a-2276-4fb2-80dd-76e3f1184bea",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.10"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/src/mozilla_sec_eia/models/sec10k/utils/pdf.py b/src/mozilla_sec_eia/models/sec10k/utils/pdf.py
index df9be07..62a1cb6 100644
--- a/src/mozilla_sec_eia/models/sec10k/utils/pdf.py
+++ b/src/mozilla_sec_eia/models/sec10k/utils/pdf.py
@@ -5,6 +5,7 @@
 """
 
 import logging
+import os
 from typing import Any
 
 import cv2
@@ -418,3 +419,17 @@ def _pil_img_from_pixmap(pix: fitz.Pixmap) -> Image.Image:
 
     img = Image.frombytes(mode, (pix.width, pix.height), pix.samples)
     return img
+
+
+def get_image_dict(pdfs_dir):
+    """Create a dictionary with filenames and their Ex. 21 images."""
+    image_dict = {}
+    for pdf_filename in os.listdir(pdfs_dir):
+        if pdf_filename.split(".")[-1] != "pdf":
+            continue
+        pdf_file_path = pdfs_dir / pdf_filename
+        _, pg = get_pdf_data_from_path(pdf_file_path)
+        full_pg_img = render_page(pg)
+        filename = pdf_filename.split(".")[0]
+        image_dict[filename] = full_pg_img
+    return image_dict
diff --git a/tests/unit/models/sec10k/ex21_model_test.py b/tests/unit/models/sec10k/ex21_model_test.py
index 0e89c1e..0b48743 100644
--- a/tests/unit/models/sec10k/ex21_model_test.py
+++ b/tests/unit/models/sec10k/ex21_model_test.py
@@ -11,13 +11,13 @@
     pandas_compute_precision_recall,
     strip_down_company_names,
 )
-from mozilla_sec_eia.models.sec10k.ex_21.ex21_validation_helpers import (
-    clean_ex21_validation_set,
-)
-from mozilla_sec_eia.models.sec10k.ex_21.inference import (
+from mozilla_sec_eia.models.sec10k.ex_21.data.common import (
     LABELS,
     get_flattened_mode_predictions,
 )
+from mozilla_sec_eia.models.sec10k.ex_21.ex21_validation_helpers import (
+    clean_ex21_validation_set,
+)
 from mozilla_sec_eia.models.sec10k.utils.layoutlm import get_id_label_conversions
 
 

From d6889e398b304c340239d6f52cb1cd2d7b541e78 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Fri, 4 Oct 2024 12:42:23 -0400
Subject: [PATCH 086/161] Minor notebook fixes

---
 src/mozilla_sec_eia/models/sec10k/__init__.py |  4 +-
 .../notebooks/exhibit21_extractor.ipynb       | 87 +++++++------------
 2 files changed, 35 insertions(+), 56 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py
index 63097e9..da82dc0 100644
--- a/src/mozilla_sec_eia/models/sec10k/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/__init__.py
@@ -53,7 +53,8 @@
 class TrainConfig(Config):
     """Config for training notebook."""
 
-    uri: str = "runs:/c363159de2f5439c93dd972d51247370/layoutlm_extractor"
+    uri: str | None = None
+    # "runs:/c363159de2f5439c93dd972d51247370/layoutlm_extractor"
     training_set: str = "labeledv0.2"
 
 
@@ -67,6 +68,7 @@ class TrainConfig(Config):
         "ex21_failed_parsing_metadata": AssetIn(),
         "ex21_inference_dataset": AssetIn(),
     },
+    save_notebook_on_failure=True,
 )
 ex21_training_job = define_asset_job(
     "ex21_training",
diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
index c03cc4e..ca3ed24 100644
--- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
+++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
@@ -14,6 +14,28 @@
     "company."
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "84aab877-9d59-4ec7-bf4b-c75e216fb1d6",
+   "metadata": {},
+   "source": [
+    "## Load upstream assets and configuration\n",
+    "The following cell can be run interactively to set configuration and load upstream assets. When running the notebook in dagster, this cell will be replaced with assets from the dagster run and dagster run configuration.\n",
+    "\n",
+    "### Config\n",
+    "- `layoutlm_uri`: If `None` the notebook will finetune layoutlm using `ex21_training_data`. If `layoutlm_uri` points to a valid model on the mlflow tracking server, the notebook will use the pre-trained model and perform inference on the validation set, logging validation metrics to a child run nested under the mlflow run associated with the pretrained model.\n",
+    "\n",
+    "### Upstream assets\n",
+    "We are using dagster assets to construct training/validation data outside the notebook to allow for easy caching. These datasets are fairly compute intensive to create, so this is useful when iterating on the model using the same data.\n",
+    "\n",
+    "NOTE: The notebook will load the most recent version of these assets, so to update the training/validation data you must rerun the dagster assets with desired configuration.\n",
+    "\n",
+    "- `ex21_training_data`: Dataset containing labeled data produced in label-studio to train `layoutlm`\n",
+    "- `ex21_validation_set`: Labeled validation data describing expected inference output on validation filings\n",
+    "- `ex21_failed_parsing_metadata`: Metadata for any validation filings that couldn't be parsed (usually empty)\n",
+    "- `ex21_inference_dataset`: Parsed validation filings prepped for inference model"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 1,
@@ -27,9 +49,17 @@
    "source": [
     "import dagstermill\n",
     "\n",
+    "from mozilla_sec_eia.models.sec10k import defs\n",
+    "\n",
     "context = dagstermill.get_context(op_config={\n",
-    "    \"uri\": None,\n",
-    "})"
+    "    \"layoutlm_uri\": None,\n",
+    "})\n",
+    "\n",
+    "ex21_training_data = defs.load_asset_value(\"ex21_training_data\")\n",
+    "\n",
+    "ex21_failed_parsing_metadata = defs.load_asset_value(\"ex21_failed_parsing_metadata\")\n",
+    "ex21_inference_dataset = defs.load_asset_value(\"ex21_inference_dataset\")\n",
+    "ex21_validation_set = defs.load_asset_value(\"ex21_validation_set\")"
    ]
   },
   {
@@ -106,43 +136,6 @@
     "    }"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "39f0cbeb-7895-46bd-97d1-2c74e5265e12",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "#### Load training data asset\n",
-    "\n",
-    "The following cell will load training data from a dagster asset. Using the dagster asset will allow easily caching the training data which can be computationally intensive to produce. When running this notebook in dagster directly, this cell will be replaced by dagster actually materializing the asset."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "f8df608a-32b7-4795-a670-63a2e8772910",
-   "metadata": {
-    "tags": [
-     "parameters"
-    ]
-   },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n",
-      "2024-10-03 17:47:13 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_training_data using PickledObjectFilesystemIOManager...\n"
-     ]
-    }
-   ],
-   "source": [
-    "from mozilla_sec_eia.models.sec10k import defs\n",
-    "\n",
-    "ex21_training_data = defs.load_asset_value(\"ex21_training_data\")"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "8160263c-8f69-437c-918b-e56ad007961a",
@@ -695,22 +688,6 @@
     "Next, load an inference dataset containing validation data. This dataset is formatted exactly the same as those that will feed into the `Ex21Extractor` during a production run, but contain only data from the validation set. When creating inference datasets we also produce a metadata dataframe documenting any filings that couldn't be parsed/converted to a PDF. This dataframe should be empty for the validation set, but we will still load it for consistency with production runs."
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "47c19b41-131f-4059-8f42-931237565a20",
-   "metadata": {
-    "tags": [
-     "parameters"
-    ]
-   },
-   "outputs": [],
-   "source": [
-    "ex21_failed_parsing_metadata = defs.load_asset_value(\"ex21_failed_parsing_metadata\")\n",
-    "ex21_inference_dataset = defs.load_asset_value(\"ex21_inference_dataset\")\n",
-    "ex21_validation_set = defs.load_asset_value(\"ex21_validation_set\")"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "eddcc912-324a-42e9-9841-3a916c6ece6b",

From d5e013aaeba7080bf347155cc57f1cdd15a6d18f Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Fri, 4 Oct 2024 12:52:33 -0400
Subject: [PATCH 087/161] Fix import in notebook

---
 .../models/sec10k/notebooks/exhibit21_extractor.ipynb         | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
index ca3ed24..e350119 100644
--- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
+++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
@@ -424,7 +424,9 @@
     "from transformers import Pipeline, pipeline\n",
     "from transformers.tokenization_utils_base import BatchEncoding\n",
     "\n",
-    "from mozilla_sec_eia.models.sec10k.inference import get_flattened_mode_predictions\n",
+    "from mozilla_sec_eia.models.sec10k.ex_21.data.common import (\n",
+    "    get_flattened_mode_predictions,\n",
+    ")\n",
     "from mozilla_sec_eia.models.sec10k.utils.layoutlm import (\n",
     "    iob_to_label,\n",
     ")\n",

From f9810db4f777a6550181934c8f41c72f8a4fe092 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Fri, 4 Oct 2024 12:57:04 -0400
Subject: [PATCH 088/161] add device to pipeline

---
 .../models/sec10k/notebooks/exhibit21_extractor.ipynb            | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
index e350119..fa04db6 100644
--- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
+++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
@@ -641,6 +641,7 @@
     "            model=self.model_components[\"model\"],\n",
     "            tokenizer=self.model_components[\"tokenizer\"],\n",
     "            pipeline_class=LayoutLMInferencePipeline,\n",
+    "            device=torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\"),\n",
     "        )\n",
     "\n",
     "        logits = []\n",

From 27608819539713a913f55788a64e81ca3daaedf5 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Fri, 4 Oct 2024 13:07:53 -0400
Subject: [PATCH 089/161] Fix signature inference

---
 .../notebooks/exhibit21_extractor.ipynb       | 118 +-----------------
 1 file changed, 5 insertions(+), 113 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
index fa04db6..d3419ad 100644
--- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
+++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
@@ -38,7 +38,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "id": "48f185de-95ef-4194-9245-93f8d603d2e6",
    "metadata": {
     "tags": [
@@ -85,7 +85,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "id": "9372b908-d9b9-4d18-a5bf-d332648b3e49",
    "metadata": {
     "tags": []
@@ -156,120 +156,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "id": "71d205b2-e6ea-4ad0-982c-22e762269119",
    "metadata": {
     "tags": []
    },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
-      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "fafaf3dc8cfe431b90802b61bfe0acc6",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Map:   0%|          | 0/159 [00:00<?, ? examples/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/tmp/ipykernel_270331/790868001.py:94: FutureWarning: load_metric is deprecated and will be removed in the next major version of datasets. Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate\n",
-      "  metric = load_metric(\"seqeval\")\n",
-      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/training_args.py:1525: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
-      "  warnings.warn(\n",
-      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
-      "To disable this warning, you can either:\n",
-      "\t- Avoid using `tokenizers` before the fork if possible\n",
-      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
-      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
-      "To disable this warning, you can either:\n",
-      "\t- Avoid using `tokenizers` before the fork if possible\n",
-      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
-      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
-      "To disable this warning, you can either:\n",
-      "\t- Avoid using `tokenizers` before the fork if possible\n",
-      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
-      "max_steps is given, it will override any value given in num_train_epochs\n",
-      "2024/10/03 17:52:03 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.\n",
-      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
-      "  warnings.warn(\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "\n",
-       "    <div>\n",
-       "      \n",
-       "      <progress value='6' max='1000' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
-       "      [   6/1000 00:02 < 10:34, 1.57 it/s, Epoch 0.04/8]\n",
-       "    </div>\n",
-       "    <table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       " <tr style=\"text-align: left;\">\n",
-       "      <th>Step</th>\n",
-       "      <th>Training Loss</th>\n",
-       "      <th>Validation Loss</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "  </tbody>\n",
-       "</table><p>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2024/10/03 17:52:09 INFO mlflow.tracking._tracking_service.client: 🏃 View run orderly-mare-33 at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/13/runs/a94ac72df36447a489d576ea06a71a4a.\n",
-      "2024/10/03 17:52:09 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/13.\n",
-      "2024/10/03 17:52:09 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...\n",
-      "2024/10/03 17:52:10 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!\n"
-     ]
-    },
-    {
-     "ename": "OutOfMemoryError",
-     "evalue": "CUDA out of memory. Tried to allocate 148.00 MiB. GPU 0 has a total capacity of 3.93 GiB of which 129.75 MiB is free. Including non-PyTorch memory, this process has 2.94 GiB memory in use. Of the allocated memory 1.89 GiB is allocated by PyTorch, and 979.98 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mOutOfMemoryError\u001b[0m                          Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[6], line 118\u001b[0m\n\u001b[1;32m    106\u001b[0m trainer \u001b[38;5;241m=\u001b[39m Trainer(\n\u001b[1;32m    107\u001b[0m     model\u001b[38;5;241m=\u001b[39mmodel,\n\u001b[1;32m    108\u001b[0m     args\u001b[38;5;241m=\u001b[39mtraining_args,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    113\u001b[0m     compute_metrics\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mlambda\u001b[39;00m p: compute_metrics(p, metric\u001b[38;5;241m=\u001b[39mmetric, label_list\u001b[38;5;241m=\u001b[39mLABELS),\n\u001b[1;32m    114\u001b[0m )\n\u001b[1;32m    116\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m mlflow\u001b[38;5;241m.\u001b[39mstart_run() \u001b[38;5;28;01mas\u001b[39;00m training_run:\n\u001b[1;32m    117\u001b[0m     \u001b[38;5;66;03m# Train inside mlflow run. Mlflow will automatically handle logging training metrcis\u001b[39;00m\n\u001b[0;32m--> 118\u001b[0m     \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    120\u001b[0m     \u001b[38;5;66;03m# Log finetuend model with mlflow\u001b[39;00m\n\u001b[1;32m    121\u001b[0m     model \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmodel\u001b[39m\u001b[38;5;124m\"\u001b[39m: trainer\u001b[38;5;241m.\u001b[39mmodel, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtokenizer\u001b[39m\u001b[38;5;124m\"\u001b[39m: trainer\u001b[38;5;241m.\u001b[39mtokenizer}\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/trainer.py:1938\u001b[0m, in \u001b[0;36mTrainer.train\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m   1936\u001b[0m         hf_hub_utils\u001b[38;5;241m.\u001b[39menable_progress_bars()\n\u001b[1;32m   1937\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1938\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   1939\u001b[0m \u001b[43m        \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1940\u001b[0m \u001b[43m        \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1941\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1942\u001b[0m \u001b[43m        \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1943\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/trainer.py:2341\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m   2338\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m   2339\u001b[0m         grad_norm \u001b[38;5;241m=\u001b[39m _grad_norm\n\u001b[0;32m-> 2341\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptimizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstep\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2343\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallback_handler\u001b[38;5;241m.\u001b[39mon_optimizer_step(args, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol)\n\u001b[1;32m   2345\u001b[0m optimizer_was_run \u001b[38;5;241m=\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator\u001b[38;5;241m.\u001b[39moptimizer_step_was_skipped\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/accelerate/optimizer.py:172\u001b[0m, in \u001b[0;36mAcceleratedOptimizer.step\u001b[0;34m(self, closure)\u001b[0m\n\u001b[1;32m    170\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_accelerate_step_called \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m    171\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 172\u001b[0m         \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptimizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstep\u001b[49m\u001b[43m(\u001b[49m\u001b[43mclosure\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    173\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator_state\u001b[38;5;241m.\u001b[39mdistributed_type \u001b[38;5;241m==\u001b[39m DistributedType\u001b[38;5;241m.\u001b[39mXLA:\n\u001b[1;32m    174\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgradient_state\u001b[38;5;241m.\u001b[39mis_xla_gradients_synced \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/optim/lr_scheduler.py:130\u001b[0m, in \u001b[0;36mLRScheduler.__init__.<locals>.patch_track_step_called.<locals>.wrap_step.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    128\u001b[0m opt \u001b[38;5;241m=\u001b[39m opt_ref()\n\u001b[1;32m    129\u001b[0m opt\u001b[38;5;241m.\u001b[39m_opt_called \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m  \u001b[38;5;66;03m# type: ignore[union-attr]\u001b[39;00m\n\u001b[0;32m--> 130\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__get__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mopt\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mopt\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;18;43m__class__\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/optim/optimizer.py:484\u001b[0m, in \u001b[0;36mOptimizer.profile_hook_step.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    479\u001b[0m         \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    480\u001b[0m             \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[1;32m    481\u001b[0m                 \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfunc\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must return None or a tuple of (new_args, new_kwargs), but got \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresult\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    482\u001b[0m             )\n\u001b[0;32m--> 484\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    485\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_optimizer_step_code()\n\u001b[1;32m    487\u001b[0m \u001b[38;5;66;03m# call optimizer step post hooks\u001b[39;00m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/optim/optimizer.py:89\u001b[0m, in \u001b[0;36m_use_grad_for_differentiable.<locals>._use_grad\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m     87\u001b[0m     torch\u001b[38;5;241m.\u001b[39mset_grad_enabled(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefaults[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdifferentiable\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m     88\u001b[0m     torch\u001b[38;5;241m.\u001b[39m_dynamo\u001b[38;5;241m.\u001b[39mgraph_break()\n\u001b[0;32m---> 89\u001b[0m     ret \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     90\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m     91\u001b[0m     torch\u001b[38;5;241m.\u001b[39m_dynamo\u001b[38;5;241m.\u001b[39mgraph_break()\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/optim/adamw.py:227\u001b[0m, in \u001b[0;36mAdamW.step\u001b[0;34m(self, closure)\u001b[0m\n\u001b[1;32m    214\u001b[0m     beta1, beta2 \u001b[38;5;241m=\u001b[39m cast(Tuple[\u001b[38;5;28mfloat\u001b[39m, \u001b[38;5;28mfloat\u001b[39m], group[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbetas\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m    216\u001b[0m     has_complex \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_init_group(\n\u001b[1;32m    217\u001b[0m         group,\n\u001b[1;32m    218\u001b[0m         params_with_grad,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    224\u001b[0m         state_steps,\n\u001b[1;32m    225\u001b[0m     )\n\u001b[0;32m--> 227\u001b[0m     \u001b[43madamw\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    228\u001b[0m \u001b[43m        \u001b[49m\u001b[43mparams_with_grad\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    229\u001b[0m \u001b[43m        \u001b[49m\u001b[43mgrads\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    230\u001b[0m \u001b[43m        \u001b[49m\u001b[43mexp_avgs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    231\u001b[0m \u001b[43m        \u001b[49m\u001b[43mexp_avg_sqs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    232\u001b[0m \u001b[43m        \u001b[49m\u001b[43mmax_exp_avg_sqs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    233\u001b[0m \u001b[43m        \u001b[49m\u001b[43mstate_steps\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    234\u001b[0m \u001b[43m        \u001b[49m\u001b[43mamsgrad\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mamsgrad\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    235\u001b[0m \u001b[43m        \u001b[49m\u001b[43mbeta1\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbeta1\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    236\u001b[0m \u001b[43m        
\u001b[49m\u001b[43mbeta2\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbeta2\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    237\u001b[0m \u001b[43m        \u001b[49m\u001b[43mlr\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mlr\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    238\u001b[0m \u001b[43m        \u001b[49m\u001b[43mweight_decay\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mweight_decay\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    239\u001b[0m \u001b[43m        \u001b[49m\u001b[43meps\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43meps\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    240\u001b[0m \u001b[43m        \u001b[49m\u001b[43mmaximize\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmaximize\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    241\u001b[0m \u001b[43m        \u001b[49m\u001b[43mforeach\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mforeach\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    242\u001b[0m \u001b[43m        
\u001b[49m\u001b[43mcapturable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcapturable\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    243\u001b[0m \u001b[43m        \u001b[49m\u001b[43mdifferentiable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdifferentiable\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    244\u001b[0m \u001b[43m        \u001b[49m\u001b[43mfused\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfused\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    245\u001b[0m \u001b[43m        \u001b[49m\u001b[43mgrad_scale\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mgrad_scale\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    246\u001b[0m \u001b[43m        \u001b[49m\u001b[43mfound_inf\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfound_inf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    247\u001b[0m \u001b[43m        
\u001b[49m\u001b[43mhas_complex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhas_complex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    248\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    250\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m loss\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/optim/optimizer.py:161\u001b[0m, in \u001b[0;36m_disable_dynamo_if_unsupported.<locals>.wrapper.<locals>.maybe_fallback\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    159\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m disabled_func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m    160\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 161\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/optim/adamw.py:767\u001b[0m, in \u001b[0;36madamw\u001b[0;34m(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, foreach, capturable, differentiable, fused, grad_scale, found_inf, has_complex, amsgrad, beta1, beta2, lr, weight_decay, eps, maximize)\u001b[0m\n\u001b[1;32m    764\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    765\u001b[0m     func \u001b[38;5;241m=\u001b[39m _single_tensor_adamw\n\u001b[0;32m--> 767\u001b[0m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    768\u001b[0m \u001b[43m    \u001b[49m\u001b[43mparams\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    769\u001b[0m \u001b[43m    \u001b[49m\u001b[43mgrads\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    770\u001b[0m \u001b[43m    \u001b[49m\u001b[43mexp_avgs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    771\u001b[0m \u001b[43m    \u001b[49m\u001b[43mexp_avg_sqs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    772\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmax_exp_avg_sqs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    773\u001b[0m \u001b[43m    \u001b[49m\u001b[43mstate_steps\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    774\u001b[0m \u001b[43m    \u001b[49m\u001b[43mamsgrad\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mamsgrad\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    775\u001b[0m \u001b[43m    \u001b[49m\u001b[43mbeta1\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbeta1\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    776\u001b[0m \u001b[43m    \u001b[49m\u001b[43mbeta2\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbeta2\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    777\u001b[0m \u001b[43m    \u001b[49m\u001b[43mlr\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlr\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    778\u001b[0m \u001b[43m    
\u001b[49m\u001b[43mweight_decay\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mweight_decay\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    779\u001b[0m \u001b[43m    \u001b[49m\u001b[43meps\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43meps\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    780\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmaximize\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmaximize\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    781\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcapturable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcapturable\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    782\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdifferentiable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdifferentiable\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    783\u001b[0m \u001b[43m    \u001b[49m\u001b[43mgrad_scale\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgrad_scale\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    784\u001b[0m \u001b[43m    \u001b[49m\u001b[43mfound_inf\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfound_inf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    785\u001b[0m \u001b[43m    \u001b[49m\u001b[43mhas_complex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhas_complex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    786\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/optim/adamw.py:600\u001b[0m, in \u001b[0;36m_multi_tensor_adamw\u001b[0;34m(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, grad_scale, found_inf, amsgrad, beta1, beta2, lr, weight_decay, eps, maximize, capturable, differentiable, has_complex)\u001b[0m\n\u001b[1;32m    598\u001b[0m     exp_avg_sq_sqrt \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39m_foreach_sqrt(device_max_exp_avg_sqs)\n\u001b[1;32m    599\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 600\u001b[0m     exp_avg_sq_sqrt \u001b[38;5;241m=\u001b[39m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_foreach_sqrt\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdevice_exp_avg_sqs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    602\u001b[0m torch\u001b[38;5;241m.\u001b[39m_foreach_div_(exp_avg_sq_sqrt, bias_correction2_sqrt)\n\u001b[1;32m    603\u001b[0m torch\u001b[38;5;241m.\u001b[39m_foreach_add_(exp_avg_sq_sqrt, eps)\n",
-      "\u001b[0;31mOutOfMemoryError\u001b[0m: CUDA out of memory. Tried to allocate 148.00 MiB. GPU 0 has a total capacity of 3.93 GiB of which 129.75 MiB is free. Including non-PyTorch memory, this process has 2.94 GiB memory in use. Of the allocated memory 1.89 GiB is allocated by PyTorch, and 979.98 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import mlflow\n",
     "from datasets import (\n",
@@ -806,7 +698,7 @@
     "        \"exhibit21_extractor\",\n",
     "        python_model=Ex21Extractor(),\n",
     "        artifacts={\"model_components\": model_uri},\n",
-    "        signature=infer_signature(dataset, extracted), # NOTE: model returns a second dataframe with metadata, but mlflow only supports one in signature\n",
+    "        signature=infer_signature(ex21_inference_dataset, extracted), # NOTE: model returns a second dataframe with metadata, but mlflow only supports one in signature\n",
     "    )"
    ]
   }

From 1dcacfaf9a217b4b102ba1ed23da603dfb9c08bd Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Fri, 4 Oct 2024 13:21:07 -0400
Subject: [PATCH 090/161] Fix notebook dagster config

---
 src/mozilla_sec_eia/models/sec10k/__init__.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py
index da82dc0..1680f4a 100644
--- a/src/mozilla_sec_eia/models/sec10k/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/__init__.py
@@ -53,8 +53,9 @@
 class TrainConfig(Config):
     """Config for training notebook."""
 
-    uri: str | None = None
-    # "runs:/c363159de2f5439c93dd972d51247370/layoutlm_extractor"
+    layoutlm_uri: str | None = (
+        "runs:/32355367ed444dd0b07f2d1b845f62d8/layoutlm_extractor"
+    )
     training_set: str = "labeledv0.2"
 
 

From 39bb45bdd2622c0ab27237ce7d5690090c58892a Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Fri, 4 Oct 2024 13:24:00 -0400
Subject: [PATCH 091/161] Fix config param name

---
 .../models/sec10k/notebooks/exhibit21_extractor.ipynb         | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
index d3419ad..3183831 100644
--- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
+++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
@@ -224,7 +224,7 @@
     "\n",
     "# Only finetune if configured to do so\n",
     "training_run_id = None\n",
-    "if context.op_config[\"uri\"] is None:\n",
+    "if context.op_config[\"layoutlm_uri\"] is None:\n",
     "    id2label, label2id = get_id_label_conversions(LABELS)\n",
     "    # Change temp_dir to save training data locally for inspection\n",
     "    # Cache/prepare training data\n",
@@ -500,7 +500,7 @@
     "if training_run_id is not None:\n",
     "    model_uri = f\"runs:/{training_run_id}/layoutlm_extractor\"\n",
     "else:\n",
-    "    model_uri = context.op_config[\"uri\"]\n",
+    "    model_uri = context.op_config[\"layoutlm_uri\"]\n",
     "\n",
     "model_info = mlflow.models.get_model_info(model_uri)\n",
     "\n",

From cb83862235cfbbd1cfe40a2b78bcd636a5882efe Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Sat, 5 Oct 2024 10:03:41 -0400
Subject: [PATCH 092/161] Partition training data

---
 .../models/sec10k/ex_21/data/__init__.py      | 24 ++++++++------
 .../notebooks/exhibit21_extractor.ipynb       | 24 +++++++++++---
 .../exhibit21_layout_classifier.ipynb         | 33 +++++++++++++++++++
 3 files changed, 66 insertions(+), 15 deletions(-)
 create mode 100644 src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py
index da5525f..2d5eff1 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py
@@ -4,7 +4,13 @@
 from tempfile import TemporaryDirectory
 
 import pandas as pd
-from dagster import AssetOut, Config, asset, multi_asset
+from dagster import (
+    AssetExecutionContext,
+    AssetOut,
+    StaticPartitionsDefinition,
+    asset,
+    multi_asset,
+)
 
 from mozilla_sec_eia.library import validation_helpers
 
@@ -15,20 +21,18 @@
 from .training import format_as_ner_annotations
 
 
-class Ex21TrainingConfig(Config):
-    """Configure asset to produce ex21 training data."""
-
-    training_set: str = "labeledv0.2"
-
-
-@asset
-def ex21_training_data(config: Ex21TrainingConfig):
+@asset(
+    partitions_def=StaticPartitionsDefinition(
+        ["labeledv0.0", "labeledv0.1", "labeledv0.2"]
+    )
+)
+def ex21_training_data(context: AssetExecutionContext):
     """Construct training dataset for ex 21 extraction."""
     with TemporaryDirectory() as temp_dir:
         ner_annotations = format_as_ner_annotations(
             labeled_json_path=Path(temp_dir) / "sec10k_filings" / "labeled_jsons",
             pdfs_path=Path(temp_dir) / "sec10k_filings" / "pdfs",
-            gcs_folder_name=config.training_set,
+            gcs_folder_name=context.partition_key,
         )
     return ner_annotations
 
diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
index 3183831..cc92a1e 100644
--- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
+++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
@@ -55,7 +55,7 @@
     "    \"layoutlm_uri\": None,\n",
     "})\n",
     "\n",
-    "ex21_training_data = defs.load_asset_value(\"ex21_training_data\")\n",
+    "ex21_training_data = defs.load_asset_value(\"ex21_training_data\", partition_key=\"labeledv0.2\")\n",
     "\n",
     "ex21_failed_parsing_metadata = defs.load_asset_value(\"ex21_failed_parsing_metadata\")\n",
     "ex21_inference_dataset = defs.load_asset_value(\"ex21_inference_dataset\")\n",
@@ -607,6 +607,13 @@
     "    \"\"\"Compute validation metrics for Ex. 21 extraction.\"\"\"\n",
     "    shared_cols = validation_df.columns.intersection(computed_df.columns)\n",
     "    validation_df = validation_df.astype(computed_df[shared_cols].dtypes)\n",
+    "    # strip llc and other company name parts for the similarity comparison\n",
+    "    computed_df[\"subsidiary\"] = validation_helpers.strip_down_company_names(\n",
+    "        computed_df[\"subsidiary\"]\n",
+    "    )\n",
+    "    validation_df[\"subsidiary\"] = validation_helpers.strip_down_company_names(\n",
+    "        validation_df[\"subsidiary\"]\n",
+    "    )\n",
     "    n_equal = 0\n",
     "    validation_filenames = validation_df[\"id\"].unique()\n",
     "    n_files = len(validation_filenames)\n",
@@ -622,15 +629,22 @@
     "            validation_df[\"id\"] == filename\n",
     "        ].reset_index(drop=True)\n",
     "        # check if the tables are exactly equal\n",
-    "        if extracted_table_df.equals(validation_table_df):\n",
-    "            # TODO: strip llc and other company strings before comparison\n",
+    "        if extracted_table_df[[\"subsidiary\", \"loc\", \"own_per\"]].equals(\n",
+    "            validation_table_df[[\"subsidiary\", \"loc\", \"own_per\"]]\n",
+    "        ):\n",
     "            n_equal += 1\n",
     "        else:\n",
     "            incorrect_files.append(filename)\n",
-    "        # compute precision and recall for each column\n",
+    "        # compute jaccard sim + precision and recall for each column\n",
     "        table_metrics_dict[filename] = {}\n",
     "        jaccard_dict[filename] = {}\n",
     "        for col in [\"subsidiary\", \"loc\", \"own_per\"]:\n",
+    "            extracted_table_df[col] = validation_helpers.fill_nulls_for_comparison(\n",
+    "                extracted_table_df[col]\n",
+    "            )\n",
+    "            validation_table_df[col] = validation_helpers.fill_nulls_for_comparison(\n",
+    "                validation_table_df[col]\n",
+    "            )\n",
     "            table_prec_recall = validation_helpers.pandas_compute_precision_recall(\n",
     "                extracted_table_df, validation_table_df, value_col=col\n",
     "            )\n",
@@ -669,7 +683,7 @@
     "            \"avg_location_recall\": prec_recall_df[\"loc_recall\"].sum() / n_files,\n",
     "            \"avg_own_per_recall\": prec_recall_df[\"own_per_recall\"].sum() / n_files,\n",
     "        },\n",
-    "    )"
+    "    )\n"
    ]
   },
   {
diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb
new file mode 100644
index 0000000..1781454
--- /dev/null
+++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb
@@ -0,0 +1,33 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "80cda90e-c2cb-4b71-b10d-cb23d7b51b3f",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

From c71593c355866d74fc25b90dd72dfeaa8c43c62f Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Sat, 5 Oct 2024 10:27:23 -0400
Subject: [PATCH 093/161] Add partitions to notebook asset

---
 src/mozilla_sec_eia/models/sec10k/__init__.py          |  2 +-
 .../models/sec10k/ex_21/data/__init__.py               | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py
index 1680f4a..9bb3557 100644
--- a/src/mozilla_sec_eia/models/sec10k/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/__init__.py
@@ -56,7 +56,6 @@ class TrainConfig(Config):
     layoutlm_uri: str | None = (
         "runs:/32355367ed444dd0b07f2d1b845f62d8/layoutlm_extractor"
     )
-    training_set: str = "labeledv0.2"
 
 
 exhibit21_extractor = define_dagstermill_asset(
@@ -70,6 +69,7 @@ class TrainConfig(Config):
         "ex21_inference_dataset": AssetIn(),
     },
     save_notebook_on_failure=True,
+    partitions_def=ex_21.data.TRAINING_DATA_VERSION_PARTS,
 )
 ex21_training_job = define_asset_job(
     "ex21_training",
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py
index 2d5eff1..06860f1 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py
@@ -20,12 +20,12 @@
 from .inference import create_inference_dataset
 from .training import format_as_ner_annotations
 
-
-@asset(
-    partitions_def=StaticPartitionsDefinition(
-        ["labeledv0.0", "labeledv0.1", "labeledv0.2"]
-    )
+TRAINING_DATA_VERSION_PARTS = StaticPartitionsDefinition(
+    ["labeledv0.0", "labeledv0.1", "labeledv0.2"]
 )
+
+
+@asset(partitions_def=TRAINING_DATA_VERSION_PARTS)
 def ex21_training_data(context: AssetExecutionContext):
     """Construct training dataset for ex 21 extraction."""
     with TemporaryDirectory() as temp_dir:

From 4efa5152eca11edc79228a4f0a04a8c2a3c149f8 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Sun, 6 Oct 2024 09:24:51 -0400
Subject: [PATCH 094/161] Update ex21 labels

---
 .../validation_data/ex21_labels.csv           | 1612 +++++++++++++----
 1 file changed, 1275 insertions(+), 337 deletions(-)

diff --git a/src/mozilla_sec_eia/package_data/validation_data/ex21_labels.csv b/src/mozilla_sec_eia/package_data/validation_data/ex21_labels.csv
index 3d51f4f..006f344 100644
--- a/src/mozilla_sec_eia/package_data/validation_data/ex21_labels.csv
+++ b/src/mozilla_sec_eia/package_data/validation_data/ex21_labels.csv
@@ -140,7 +140,7 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage
 61339-0001161728-17-000004,"State Energy Services, LLC",,
 107815-0000107815-17-000106,"ATC Management, Inc.",Wisconsin,26.24
 107815-0000107815-17-000106,American Transmission Company LLC,Wisconsin,23.04
-107815-0000107815-17-000106,Bostco LLC,Wisconsin,100
+107815-0000107815-17-000106,Bostco LLC,Wisconsin,100.0
 1317577-0001193125-13-356794,"Elemental Energy, Inc.",Arizona,
 1317577-0001193125-13-356794,Klondyke Construction LLC,Arizona,
 1317577-0001193125-13-356794,"Pike Electric, LLC",North Carolina,
@@ -150,61 +150,61 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage
 1317577-0001193125-13-356794,"Pine Valley Power, Inc.",Utah,
 1317577-0001193125-13-356794,"Synergetic Design Holdings, Inc.",Delaware,
 1317577-0001193125-13-356794,"UC Synergetic, Inc.",South Carolina,
-40545-0000040545-04-000013,"AMERICAN SILICONES, INC.",Indiana,100
-40545-0000040545-04-000013,"BENTLY NEVADA, LLC",Delaware,100
-40545-0000040545-04-000013,CARIBE GE INTERNATIONAL ELECTRIC METERS CORP,Puerto Rico,100
-40545-0000040545-04-000013,"CARDINAL COGEN, INC.",Delaware,100
-40545-0000040545-04-000013,"DATEX-OHMEDA, INC.",Delaware,100
-40545-0000040545-04-000013,ELANO CORPORATION,Ohio,100
-40545-0000040545-04-000013,"GEAE TECHNOLOGY, INC.",Delaware,100
-40545-0000040545-04-000013,GE CGR EUROPE,France,100
-40545-0000040545-04-000013,"GE DRIVES and CONTROLS, INC.",Delaware,100
-40545-0000040545-04-000013,GE DRUCK HOLDINGS LIMITED,Delaware,100
-40545-0000040545-04-000013,"GE ELECTRIC CANADA, INC.",Canada,100
-40545-0000040545-04-000013,"GE ENERGY EUROPE, BV",Netherlands,100
-40545-0000040545-04-000013,GE ENERGY PARTS INC.,Delaware,100
-40545-0000040545-04-000013,"GE ENERGY PRODUCTS, INC.",Delaware,100
-40545-0000040545-04-000013,"GE ENERGY SERVICES, INC.",Delaware,100
-40545-0000040545-04-000013,"GE ENERGY SERVICES-DALLAS, LP",Delaware,100
-40545-0000040545-04-000013,"GE ENGINE SERVICES DISTRIBUTION, LLC.",Delaware,100
-40545-0000040545-04-000013,"GE ENGINE SERVICES, INC.",Delaware,100
-40545-0000040545-04-000013,GE FANUC AUTOMATION CORPORATION,Delaware,50
-40545-0000040545-04-000013,GE GAS TURBINES (GREENVILLE) L.L.C,Delaware,100
-40545-0000040545-04-000013,"GE HUNGARY CO., LTD",Hungary,100
-40545-0000040545-04-000013,"GE INTERLOGIX, INC.",Delaware,100
-40545-0000040545-04-000013,"GE INVESTMENT, INC.",Nevada,100
-40545-0000040545-04-000013,"GE KEPPEL ENERGY SERVICES PTE, INC.",Singapore,100
-40545-0000040545-04-000013,"GE MEDICAL GLOBAL TECHNOLOGY CO., LLC",Delaware,100
-40545-0000040545-04-000013,"GE MEDICAL SYSTEMS INFORMATION TECHNOLOGIES, INC.",Wisconsin,100
-40545-0000040545-04-000013,"GE MEDICAL SYSTEMS, INC.",Delaware,100
-40545-0000040545-04-000013,GE PACKAGED POWER L.P.,Delaware,100
-40545-0000040545-04-000013,"GE PETROCHEMICALS, INC.",Delaware,100
-40545-0000040545-04-000013,"GE PLASTIC FINISHING, INC.",Delaware,100
-40545-0000040545-04-000013,GE PLASTICS ESPANA ScPA,"Spain & Canary Islands, Balearic Island",100
-40545-0000040545-04-000013,GE PLASTICS PACIFIC PTE. LTD,Singapore,100
-40545-0000040545-04-000013,"GE POLYMERLAND, INC",Delaware,100
-40545-0000040545-04-000013,GE POWER SYSTEMS LICENSING INC,Delaware,100
-40545-0000040545-04-000013,"GE QUARTZ, INC.",Delaware,100
-40545-0000040545-04-000013,"GE SILICONES WV, LLC",West Virginia,100
-40545-0000040545-04-000013,"GE SUPERABRASIVES, INC.",Delaware,100
-40545-0000040545-04-000013,"GE TRANSPORTATION PARTS, LLC",Delaware,100
-40545-0000040545-04-000013,"GE TRANSPORTATION SERVICES, LLC.",Delaware,100
-40545-0000040545-04-000013,"GE TRANSPORTATION SYSTEMS GLOBAL SIGNALING, LLC.",Delaware,100
-40545-0000040545-04-000013,GEA PRODUCTS LP,Delaware,100
-40545-0000040545-04-000013,GENERAL ELECTRIC INTERNATIONAL (BENELUX) BV,Netherlands,100
-40545-0000040545-04-000013,"GENERAL ELECTRIC INTERNATIONAL, INC.",Delaware,100
-40545-0000040545-04-000013,"GRANITE SERVICES, INC.",Delaware,100
-40545-0000040545-04-000013,NATIONAL BROADCASTING COMPANY (NBC),Delaware,100
-40545-0000040545-04-000013,"NUCLEAR FUEL HOLDING CO.,INC",Delaware,100
-40545-0000040545-04-000013,NUOVO PIGNONE HOLDING S.P.A,Italy,100
-40545-0000040545-04-000013,OEC MEDICAL SYSTEMS INC,Delaware,100
-40545-0000040545-04-000013,PII LIMITED,United Kingdom & Northern Ireland,100
-40545-0000040545-04-000013,"REUTER-STOKES, INC.",Delaware,100
-40545-0000040545-04-000013,"SENSING SOLUTIONS, INC.",Delaware,100
-40545-0000040545-04-000013,"VICEROY, INC.",Delaware,100
-40545-0000040545-04-000013,"GENERAL ELECTRIC CAPITAL SERVICES, INC.",Delaware,100
-40545-0000040545-04-000013,General Electric Capital Corporation,New York,100
-40545-0000040545-04-000013,GE Global Insurance Holding Corporation,Missouri,100
+40545-0000040545-04-000013,"AMERICAN SILICONES, INC.",Indiana,100.0
+40545-0000040545-04-000013,"BENTLY NEVADA, LLC",Delaware,100.0
+40545-0000040545-04-000013,CARIBE GE INTERNATIONAL ELECTRIC METERS CORP,Puerto Rico,100.0
+40545-0000040545-04-000013,"CARDINAL COGEN, INC.",Delaware,100.0
+40545-0000040545-04-000013,"DATEX-OHMEDA, INC.",Delaware,100.0
+40545-0000040545-04-000013,ELANO CORPORATION,Ohio,100.0
+40545-0000040545-04-000013,"GEAE TECHNOLOGY, INC.",Delaware,100.0
+40545-0000040545-04-000013,GE CGR EUROPE,France,100.0
+40545-0000040545-04-000013,"GE DRIVES and CONTROLS, INC.",Delaware,100.0
+40545-0000040545-04-000013,GE DRUCK HOLDINGS LIMITED,Delaware,100.0
+40545-0000040545-04-000013,"GE ELECTRIC CANADA, INC.",Canada,100.0
+40545-0000040545-04-000013,"GE ENERGY EUROPE, BV",Netherlands,100.0
+40545-0000040545-04-000013,GE ENERGY PARTS INC.,Delaware,100.0
+40545-0000040545-04-000013,"GE ENERGY PRODUCTS, INC.",Delaware,100.0
+40545-0000040545-04-000013,"GE ENERGY SERVICES, INC.",Delaware,100.0
+40545-0000040545-04-000013,"GE ENERGY SERVICES-DALLAS, LP",Delaware,100.0
+40545-0000040545-04-000013,"GE ENGINE SERVICES DISTRIBUTION, LLC.",Delaware,100.0
+40545-0000040545-04-000013,"GE ENGINE SERVICES, INC.",Delaware,100.0
+40545-0000040545-04-000013,GE FANUC AUTOMATION CORPORATION,Delaware,50.0
+40545-0000040545-04-000013,GE GAS TURBINES (GREENVILLE) L.L.C,Delaware,100.0
+40545-0000040545-04-000013,"GE HUNGARY CO., LTD",Hungary,100.0
+40545-0000040545-04-000013,"GE INTERLOGIX, INC.",Delaware,100.0
+40545-0000040545-04-000013,"GE INVESTMENT, INC.",Nevada,100.0
+40545-0000040545-04-000013,"GE KEPPEL ENERGY SERVICES PTE, INC.",Singapore,100.0
+40545-0000040545-04-000013,"GE MEDICAL GLOBAL TECHNOLOGY CO., LLC",Delaware,100.0
+40545-0000040545-04-000013,"GE MEDICAL SYSTEMS INFORMATION TECHNOLOGIES, INC.",Wisconsin,100.0
+40545-0000040545-04-000013,"GE MEDICAL SYSTEMS, INC.",Delaware,100.0
+40545-0000040545-04-000013,GE PACKAGED POWER L.P.,Delaware,100.0
+40545-0000040545-04-000013,"GE PETROCHEMICALS, INC.",Delaware,100.0
+40545-0000040545-04-000013,"GE PLASTIC FINISHING, INC.",Delaware,100.0
+40545-0000040545-04-000013,GE PLASTICS ESPANA ScPA,"Spain & Canary Islands, Balearic Island",100.0
+40545-0000040545-04-000013,GE PLASTICS PACIFIC PTE. LTD,Singapore,100.0
+40545-0000040545-04-000013,"GE POLYMERLAND, INC",Delaware,100.0
+40545-0000040545-04-000013,GE POWER SYSTEMS LICENSING INC,Delaware,100.0
+40545-0000040545-04-000013,"GE QUARTZ, INC.",Delaware,100.0
+40545-0000040545-04-000013,"GE SILICONES WV, LLC",West Virginia,100.0
+40545-0000040545-04-000013,"GE SUPERABRASIVES, INC.",Delaware,100.0
+40545-0000040545-04-000013,"GE TRANSPORTATION PARTS, LLC",Delaware,100.0
+40545-0000040545-04-000013,"GE TRANSPORTATION SERVICES, LLC.",Delaware,100.0
+40545-0000040545-04-000013,"GE TRANSPORTATION SYSTEMS GLOBAL SIGNALING, LLC.",Delaware,100.0
+40545-0000040545-04-000013,GEA PRODUCTS LP,Delaware,100.0
+40545-0000040545-04-000013,GENERAL ELECTRIC INTERNATIONAL (BENELUX) BV,Netherlands,100.0
+40545-0000040545-04-000013,"GENERAL ELECTRIC INTERNATIONAL, INC.",Delaware,100.0
+40545-0000040545-04-000013,"GRANITE SERVICES, INC.",Delaware,100.0
+40545-0000040545-04-000013,NATIONAL BROADCASTING COMPANY (NBC),Delaware,100.0
+40545-0000040545-04-000013,"NUCLEAR FUEL HOLDING CO.,INC",Delaware,100.0
+40545-0000040545-04-000013,NUOVO PIGNONE HOLDING S.P.A,Italy,100.0
+40545-0000040545-04-000013,OEC MEDICAL SYSTEMS INC,Delaware,100.0
+40545-0000040545-04-000013,PII LIMITED,United Kingdom & Northern Ireland,100.0
+40545-0000040545-04-000013,"REUTER-STOKES, INC.",Delaware,100.0
+40545-0000040545-04-000013,"SENSING SOLUTIONS, INC.",Delaware,100.0
+40545-0000040545-04-000013,"VICEROY, INC.",Delaware,100.0
+40545-0000040545-04-000013,"GENERAL ELECTRIC CAPITAL SERVICES, INC.",Delaware,100.0
+40545-0000040545-04-000013,General Electric Capital Corporation,New York,100.0
+40545-0000040545-04-000013,GE Global Insurance Holding Corporation,Missouri,100.0
 39547-0001047469-03-024149,"Turtle Shell, Inc. (f/k/a Snapper, Inc.)","Georgia, USA",
 39547-0001047469-03-024149,Actava Financial Ltd.,Delaware,
 39547-0001047469-03-024149,"Actava SHL, Inc.",Delaware,
@@ -271,11 +271,6 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage
 49728-0001144204-11-070058,IEC Electronics Corp.-Albuquerque,New Mexico,
 49728-0001144204-11-070058,"Dynamic Research and Testing Laboratories, LLC",New Mexico,
 49728-0001144204-11-070058,"Southern California Braiding, Inc.",Delaware,
-200155-0000021267-99-000027,"CIG Exploration, Inc",Delaware,
-200155-0000021267-99-000027,CIG Field Services Company,Delaware,
-200155-0000021267-99-000027,"Great Divide Gas Services, LLC",Colorado,73
-200155-0000021267-99-000027,Colorado Water Supply Company,Delaware,
-200155-0000021267-99-000027,Colorado Interstate Production Company,Delaware,
 315858-0000315858-19-000023,"Woodbridge Holdings, LLC",Florida,
 315858-0000315858-19-000023,"BBX Capital Florida, LLC",Florida,
 315858-0000315858-19-000023,"Eden Services, Inc.",Florida,
@@ -382,7 +377,7 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage
 315858-0000315858-19-000023,"Hialeah Multifamily, LLC",Florida,
 315858-0000315858-19-000023,"BBX Residential Victoria Park, LLC",Florida,
 315858-0000315858-19-000023,"Premier Flagler, LLC",Florida,
-315858-0000315858-19-000023,Banc Servicing Center LLC,Florida,
+315858-0000315858-19-000023,"Banc Servicing Center, LLC",Florida,
 315858-0000315858-19-000023,"Fidelity Service, LLC",Florida,
 315858-0000315858-19-000023,"Fidelity Tax, LLC",Florida,
 315858-0000315858-19-000023,"Heartwood 3, LLC",Florida,
@@ -509,11 +504,11 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage
 718877-0001047469-08-007085,Treyarch Corporation,Delaware,
 718877-0001047469-08-007085,"Toys For Bob, Inc.",California,
 718877-0001047469-08-007085,"Vicarious Visions, Inc.",New York,
-811669-0000950123-05-002610,International Wine & Spirits Ltd.,Delaware,100
-811669-0000950123-05-002610,Ste. Michelle Wine Estates Ltd.,Washington,100
-811669-0000950123-05-002610,U.S. Smokeless Tobacco Company,Delaware,100
-811669-0000950123-05-002610,U.S. Smokeless Tobacco Manufacturing Limited Partnership,Delaware,100
-811669-0000950123-05-002610,U.S. Smokeless Tobacco Brands Inc.,Delaware,100
+811669-0000950123-05-002610,International Wine & Spirits Ltd.,Delaware,100.0
+811669-0000950123-05-002610,Ste. Michelle Wine Estates Ltd.,Washington,100.0
+811669-0000950123-05-002610,U.S. Smokeless Tobacco Company,Delaware,100.0
+811669-0000950123-05-002610,U.S. Smokeless Tobacco Manufacturing Limited Partnership,Delaware,100.0
+811669-0000950123-05-002610,U.S. Smokeless Tobacco Brands Inc.,Delaware,100.0
 857501-0001065949-17-000087,Jacobs & Company,West Virginia,
 857501-0001065949-17-000087,"FS Investments, Inc.",West Virginia,
 857501-0001065949-17-000087,"Triangle Surety Agency, Inc.",West Virginia,
@@ -626,22 +621,6 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage
 908255-0000908255-13-000006,BorgWarner Turbo and Emissions Systems de Mexico S.A. de C.V.,,
 908255-0000908255-13-000006,BorgWarner (Thailand) Limited,,
 908255-0000908255-13-000006,"BorgWarner (China) Research & Development Co., Ltd.",,
-913614-0000930661-01-502777,Bactolac Pharmaceutical Inc.,Delaware,
-913614-0000930661-01-502777,"ANI Pharmaceuticals, Inc.",Mississippi,
-913614-0000930661-01-502777,NL Acquisition Company,Delaware,
-923472-0000892569-97-000821,Samantha Hotel Corporation,Delaware,
-923472-0000892569-97-000821,"RFS, Inc.",Tennessee,
-923472-0000892569-97-000821,Doubletree Partners,Delaware,
-923472-0000892569-97-000821,Doubletree Hotels Corporation,Arizona,
-923472-0000892569-97-000821,"Doubletree of Phoenix, Inc.",Delaware,
-923472-0000892569-97-000821,INNCO Corporation,Arizona,
-923472-0000892569-97-000821,HOSCO Corporation,Arizona,
-923472-0000892569-97-000821,"DT Management, Inc.",Arizona,
-923472-0000892569-97-000821,"DT Real Estate, Inc.",Arizona,
-923472-0000892569-97-000821,"Doubletree Hotel Systems, Inc.",Arizona,
-923472-0000892569-97-000821,Harbor Hotels Corporation,Delaware,
-923472-0000892569-97-000821,"DTM Burlingame, Inc.",Arizona,
-923472-0000892569-97-000821,"Red Lion Hotels, Inc.",Delaware,
 1484565-0001564590-20-008705,Soleno Therapeutics UK Ltd.,United Kingdom,
 1484565-0001564590-20-008705,Soleno Therapeutics Europe Ltd.,Ireland,
 1484565-0001564590-20-008705,"Essentialis, Inc.",Delaware,
@@ -715,9 +694,9 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage
 354707-0000354707-19-000043,"Hawaii Electric Light Company, Inc.",Hawaii,
 354707-0000354707-19-000043,"Renewable Hawaii, Inc.",Hawaii,
 354707-0000354707-19-000043,Uluwehiokama Biofuels Corp.,Hawaii,
-354707-0000354707-19-000043,HECO Capital Trust III,Delaware,
+354707-0000354707-19-000043,HECO Capital Trust III (a statutory trust),Delaware,
 354707-0000354707-19-000043,"ASB Hawaii, Inc.",Hawaii,
-354707-0000354707-19-000043,"American Savings Bank, F.S.B.",,
+354707-0000354707-19-000043,"American Savings Bank, F.S.B.",federally chartered,
 354707-0000354707-19-000043,"The Old Oahu Tug Service, Inc. ",Hawaii,
 354707-0000354707-19-000043,"Pacific Current, LLC",Hawaii,
 354707-0000354707-19-000043,"Hamakua Holdings, LLC",Hawaii,
@@ -739,7 +718,7 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage
 84557-0001046861-06-000007,The Southern Connecticut Gas Company,Connecticut,
 100826-0001193125-09-042636,Ameren Corporation,Missouri,
 100826-0001193125-09-042636,Ameren Development Company,Missouri,
-100826-0001193125-09-042636,"Enporion, Inc.",Delaware,21
+100826-0001193125-09-042636,"Enporion, Inc.",Delaware,21.0
 100826-0001193125-09-042636,Missouri Central Railroad Company,Delaware,
 100826-0001193125-09-042636,CIPSCO Leasing Company,Illinois,
 100826-0001193125-09-042636,"Gateway Energy Systems, L.C.",Missouri,89.1
@@ -749,7 +728,7 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage
 100826-0001193125-09-042636,Coffeen and Western Railroad Company,Illinois,
 100826-0001193125-09-042636,Ameren Energy Marketing Company,Illinois,
 100826-0001193125-09-042636,Illinois Materials Supply Co.,Illinois,
-100826-0001193125-09-042636,"Electric Energy, Inc.",Illinois,80
+100826-0001193125-09-042636,"Electric Energy, Inc.",Illinois,80.0
 100826-0001193125-09-042636,Midwest Electric Power Inc.,Illinois,
 100826-0001193125-09-042636,Joppa and Eastern Railroad Company,Illinois,
 100826-0001193125-09-042636,"Met South, Inc.",Illinois,
@@ -765,76 +744,68 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage
 100826-0001193125-09-042636,CLC Aircraft Leasing LLC,Delaware,
 100826-0001193125-09-042636,QST Enterprises Inc.,Illinois,
 100826-0001193125-09-042636,ESE Land Corporation,Illinois,
-100826-0001193125-09-042636,California/Nevada Development L.L.C.,Delaware,15
+100826-0001193125-09-042636,California/Nevada Development L.L.C.,Delaware,15.0
 100826-0001193125-09-042636,Energy Risk Assurance Company,Vermont,
 100826-0001193125-09-042636,Missouri Energy Risk Assurance Company LLC,Missouri,
-100826-0001193125-09-042636,"Illinois Power Company, d/b/a AmerenIP",Illinois,
+100826-0001193125-09-042636,Illinois Power Company,Illinois,
 100826-0001193125-09-042636,Illinois Power Securitization Limited Liability Company,Delaware,
 100826-0001193125-09-042636,Illinois Power Special Purpose Trust,Delaware,
 100826-0001193125-09-042636,Union Electric Company,Missouri,
-100826-0001193125-09-042636,Fuelco LLC,Delaware,
-81033-0000950117-06-000927,Public Service Electric and Gas Company,New Jersey,100
-81033-0000950117-06-000927,PSEG Power LLC ,Delaware,100
-81033-0000950117-06-000927,PSEG Fossil LLC,Delaware,100
-81033-0000950117-06-000927,PSEG Energy Resources & Trade LLC,Delaware,100
-81033-0000950117-06-000927,PSEG Energy Holdings L.L.C. ,New Jersey,100
-81033-0000950117-06-000927,PSEG Resources L.L.C.,New Jersey,100
-81033-0000950117-06-000927,PSEG Global L.L.C.,New Jersey,100
-81033-0000950117-06-000927,PSEG Global International Holdings LLC,Delaware,100
-4904-0000004904-09-000040,"American Electric Power Company, Inc.",New York,100
-4904-0000004904-09-000040,American Electric Power Service Corporation,New York,100
-4904-0000004904-09-000040,"AEP C&I Company, LLC",Delaware,100
-4904-0000004904-09-000040,"AEP Coal, Inc.",Nevada,100
-4904-0000004904-09-000040,"AEP Communications, Inc.",Ohio,100
-4904-0000004904-09-000040,"AEP Credit, Inc.",Delaware,100
-4904-0000004904-09-000040,AEP Generating Company,Ohio,100
-4904-0000004904-09-000040,"AEP Investments, Inc.",Ohio,100
-4904-0000004904-09-000040,AEP Nonutility Funding LLC,Delaware,100
-4904-0000004904-09-000040,"AEP Power Marketing, Inc.",Ohio,100
-4904-0000004904-09-000040,"AEP Pro Serv, Inc.",Ohio,100
-4904-0000004904-09-000040,"AEP Resources, Inc.",Ohio,100
-4904-0000004904-09-000040,"AEP T&D Services, LLC",Delaware,100
-4904-0000004904-09-000040,"AEP Transmission Holding Company, LLC",Delaware,100
-4904-0000004904-09-000040,"AEP Utilities, Inc.",Delaware,100
-4904-0000004904-09-000040,AEP Texas Central Company,Texas,100
-4904-0000004904-09-000040,AEP Texas Central Transition Funding LLC,Delaware,100
-4904-0000004904-09-000040,AEP Texas Central Transition Funding II LLC,Delaware,100
-4904-0000004904-09-000040,AEP Texas North Company,Texas,100
-4904-0000004904-09-000040,AEP Texas North Generation Company LLC,Delaware,100
-4904-0000004904-09-000040,"CSW Energy, Inc.",Texas,100
-4904-0000004904-09-000040,"CSW Energy Services, Inc.",Delaware,100
-4904-0000004904-09-000040,"CSW International, Inc.",Delaware,100
-4904-0000004904-09-000040,"Electric Transmission Texas, LLC",Delaware,50
-4904-0000004904-09-000040,AEP Utility Funding LLC,Delaware,100
+100826-0001193125-09-042636,Fuelco LLC,Delaware,33.3
+4904-0000004904-09-000040,"American Electric Power Company, Inc.",New York,
+4904-0000004904-09-000040,American Electric Power Service Corporation,New York,100.0
+4904-0000004904-09-000040,"AEP C&I Company, LLC",Delaware,100.0
+4904-0000004904-09-000040,"AEP Coal, Inc.",Nevada,100.0
+4904-0000004904-09-000040,"AEP Communications, Inc.",Ohio,100.0
+4904-0000004904-09-000040,"AEP Credit, Inc.",Delaware,100.0
+4904-0000004904-09-000040,AEP Generating Company,Ohio,100.0
+4904-0000004904-09-000040,"AEP Investments, Inc.",Ohio,100.0
+4904-0000004904-09-000040,AEP Nonutility Funding LLC,Delaware,100.0
+4904-0000004904-09-000040,"AEP Power Marketing, Inc.",Ohio,100.0
+4904-0000004904-09-000040,"AEP Pro Serv, Inc.",Ohio,100.0
+4904-0000004904-09-000040,"AEP Resources, Inc.",Ohio,100.0
+4904-0000004904-09-000040,"AEP T&D Services, LLC",Delaware,100.0
+4904-0000004904-09-000040,"AEP Transmission Holding Company, LLC",Delaware,100.0
+4904-0000004904-09-000040,"AEP Utilities, Inc.",Delaware,100.0
+4904-0000004904-09-000040,AEP Texas Central Company,Texas,100.0
+4904-0000004904-09-000040,AEP Texas Central Transition Funding LLC,Delaware,100.0
+4904-0000004904-09-000040,AEP Texas Central Transition Funding II LLC,Delaware,100.0
+4904-0000004904-09-000040,AEP Texas North Company,Texas,100.0
+4904-0000004904-09-000040,AEP Texas North Generation Company LLC,Delaware,100.0
+4904-0000004904-09-000040,"CSW Energy, Inc.",Texas,100.0
+4904-0000004904-09-000040,"CSW Energy Services, Inc.",Delaware,100.0
+4904-0000004904-09-000040,"CSW International, Inc.",Delaware,100.0
+4904-0000004904-09-000040,"Electric Transmission Texas, LLC",Delaware,50.0
+4904-0000004904-09-000040,AEP Utility Funding LLC,Delaware,100.0
 4904-0000004904-09-000040,Appalachian Power Company,Virginia,98.7
-4904-0000004904-09-000040,Cedar Coal Co.,West Virginia,100
-4904-0000004904-09-000040,Central Appalachian Coal Company,West Virginia,100
-4904-0000004904-09-000040,Central Coal Company,West Virginia,50
-4904-0000004904-09-000040,Southern Appalachian Coal Company,West Virginia,100
-4904-0000004904-09-000040,Columbus Southern Power Company,Ohio,100
-4904-0000004904-09-000040,"Colomet, Inc.",Ohio,100
-4904-0000004904-09-000040,Conesville Coal Preparation Company ,Ohio,100
+4904-0000004904-09-000040,Cedar Coal Co.,West Virginia,100.0
+4904-0000004904-09-000040,Central Appalachian Coal Company,West Virginia,100.0
+4904-0000004904-09-000040,Central Coal Company,West Virginia,50.0
+4904-0000004904-09-000040,Southern Appalachian Coal Company,West Virginia,100.0
+4904-0000004904-09-000040,Columbus Southern Power Company,Ohio,100.0
+4904-0000004904-09-000040,"Colomet, Inc.",Ohio,100.0
+4904-0000004904-09-000040,Conesville Coal Preparation Company ,Ohio,100.0
 4904-0000004904-09-000040,Ohio Valley Electric Corporation,Ohio,4.3
-4904-0000004904-09-000040,Indiana-Kentucky Electric Corporation,Indiana,100
-4904-0000004904-09-000040,Franklin Real Estate Company,Pennsylvania,100
-4904-0000004904-09-000040,Indiana Michigan Power Company,Indiana,100
-4904-0000004904-09-000040,Blackhawk Coal Company,Utah,100
-4904-0000004904-09-000040,Price River Coal Company ,Indiana,100
-4904-0000004904-09-000040,Kentucky Power Company,Kentucky,100
-4904-0000004904-09-000040,Kingsport Power Company,Virginia,100
+4904-0000004904-09-000040,Indiana-Kentucky Electric Corporation,Indiana,100.0
+4904-0000004904-09-000040,Franklin Real Estate Company,Pennsylvania,100.0
+4904-0000004904-09-000040,Indiana Michigan Power Company,Indiana,100.0
+4904-0000004904-09-000040,Blackhawk Coal Company,Utah,100.0
+4904-0000004904-09-000040,Price River Coal Company ,Indiana,100.0
+4904-0000004904-09-000040,Kentucky Power Company,Kentucky,100.0
+4904-0000004904-09-000040,Kingsport Power Company,Virginia,100.0
 4904-0000004904-09-000040,Ohio Power Company ,Ohio,99.4
-4904-0000004904-09-000040,Cardinal Operating Company,Ohio,50
-4904-0000004904-09-000040,Central Coal Company,West Virginia,50
+4904-0000004904-09-000040,Cardinal Operating Company,Ohio,50.0
+4904-0000004904-09-000040,Central Coal Company,West Virginia,50.0
 4904-0000004904-09-000040,Ohio Valley Electric Corporation,Ohio,39.2
-4904-0000004904-09-000040,Indiana-Kentucky Electric Corporation,Indiana,100
+4904-0000004904-09-000040,Indiana-Kentucky Electric Corporation,Indiana,100.0
 4904-0000004904-09-000040,"Power Tree Carbon Company, LLC",Delaware,9.2
-4904-0000004904-09-000040,Public Service Company of Oklahoma,Oklahoma,100
-4904-0000004904-09-000040,Southwestern Electric Power Company,Delaware,100
-4904-0000004904-09-000040,"Dolet Hills Lignite Company, LLC ",Delaware,100
-4904-0000004904-09-000040,Southwestern Arkansas Utilities Corporation,Arkansas,100
-4904-0000004904-09-000040,SWEPCo Capital Trust I,Delaware,100
+4904-0000004904-09-000040,Public Service Company of Oklahoma,Oklahoma,100.0
+4904-0000004904-09-000040,Southwestern Electric Power Company,Delaware,100.0
+4904-0000004904-09-000040,"Dolet Hills Lignite Company, LLC ",Delaware,100.0
+4904-0000004904-09-000040,Southwestern Arkansas Utilities Corporation,Arkansas,100.0
+4904-0000004904-09-000040,SWEPCo Capital Trust I,Delaware,100.0
 4904-0000004904-09-000040,The Arklahoma Corporation,Arkansas,47.6
-4904-0000004904-09-000040,Wheeling Power Company,West Virginia,100
+4904-0000004904-09-000040,Wheeling Power Company,West Virginia,100.0
 46207-0001104659-13-011461,"Hawaiian Electric Company, Inc.",Hawaii,
 46207-0001104659-13-011461,"Maui Electric Company, Limited",Hawaii,
 46207-0001104659-13-011461,"Hawaii Electric Light Company, Inc.",Hawaii,
@@ -842,46 +813,11 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage
 46207-0001104659-13-011461,Uluwehiokama Biofuels Corp.,Hawaii,
 46207-0001104659-13-011461,HECO Capital Trust III,Delaware,
 46207-0001104659-13-011461,"American Savings Holdings, Inc.",Hawaii,
-46207-0001104659-13-011461,"American Savings Bank, F.S.B.",,
+46207-0001104659-13-011461,"American Savings Bank, F.S.B.",federally chartered,
 46207-0001104659-13-011461,"HEI Properties, Inc.",Hawaii,
 46207-0001104659-13-011461,Hawaiian Electric Industries Capital Trust II ,Delaware,
 46207-0001104659-13-011461,Hawaiian Electric Industries Capital Trust III,Delaware,
 46207-0001104659-13-011461,"The Old Oahu Tug Service, Inc.",Hawaii,
-205402-0000950114-99-000043,Graybar Foreign Sales Corporation,Barbados,
-205402-0000950114-99-000043,"Graybar International, Inc.",Missouri,
-205402-0000950114-99-000043,"Graybar Financial Services, Inc.",Missouri,
-205402-0000950114-99-000043,"Graybar Electric de Mexico, S. DE R.L. DE C.V.,",Mexican,
-205402-0000950114-99-000043,Graybar Electric Limited,Nova Scotia,
-205402-0000950114-99-000043,"Graybar Foundation, Inc",Missouri,
-205402-0000950114-99-000043,"Graybar Services, Inc.",Illinois,
-205402-0000950114-99-000043,"Distribution Associates, Inc.",Missouri,
-205402-0000950114-99-000043,Graybar Electric (Ontario) Limited,Ontario,
-205402-0000950114-99-000043,Graybar International PTE LTD,Singaporean,
-205402-0000950114-99-000043,"Graybar Business Services, Inc.",Missouri,
-205402-0000950114-99-000043,Graybar International de Chile Limitada,Chile,
-9342-0000009342-95-000008,"Baldor of Arkansas, Inc.",Arkansas,100
-9342-0000009342-95-000008,"Baldor of Nevada, Inc.",Nevada,100
-9342-0000009342-95-000008,BEC Business Trust,Massachusetts,100
-9342-0000009342-95-000008,"Baldor of Texas, L.P.",Texas,100
-9342-0000009342-95-000008,"Baldor International, Inc.",U.S. Virgin Islands,100
-9342-0000009342-95-000008,"Carolina Capacitors, Inc.",South Carolina,100
-9342-0000009342-95-000008,"Southwestern Die Casting Co., Inc.",Arkansas,100
-9342-0000009342-95-000008,"Sweo Controls, Inc.",Washington,100
-9342-0000009342-95-000008,"Baldor Holdings, Inc.",Delaware,100
-9342-0000009342-95-000008,"Baldor de Mexico, S.A. de C.V.",Mexico,100
-9342-0000009342-95-000008,"Baldor ASR, AG",Switzerland,100
-9342-0000009342-95-000008,Baldor ASR GmbH fuer Antriebstechnik,Germany,100
-9342-0000009342-95-000008,Baldor ASR U.K. Limited,United Kingdom,100
-9342-0000009342-95-000008,Australian Baldor Pty. Limited,Australia,60
-9342-0000009342-95-000008,Baldor Electric (Far East) PTE. Ltd.,Singapore,60
-9342-0000009342-95-000008,Baldor Electric (Thailand) Ltd.,Thailand,100
-9342-0000009342-95-000008,Baldor Industrial Automation PTE. Ltd.,Singapore,100
-9342-0000009342-95-000008,Baldor Electric (Indonesia) Ltd.,Indonesia,100
-9342-0000009342-95-000008,Baldor of Nevada,,
-9342-0000009342-95-000008,Baldor Business Trust (LP),,
-9342-0000009342-95-000008,Baldor of Arkansas (GP),,
-9342-0000009342-95-000008,"Baldor Holdings, Inc.",,
-9342-0000009342-95-000008,Baldor Electric (Far East) PTE. Ltd.,,
 9534-0000897069-05-000574,Bandag A.G,Switzerland,
 9534-0000897069-05-000574,Bandag Canada Ltd.,Canada,
 9534-0000897069-05-000574,Bandag Europe N.V,Belgium,
@@ -897,7 +833,7 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage
 18647-0001169232-08-000603,"Phoenix Development Company, Inc.",New York,
 18647-0001169232-08-000603,Central Hudson Enterprises Corporation,New York,
 18647-0001169232-08-000603,"Griffith Energy Services, Inc.",New York,
-20947-0001031296-06-000044,Ohio Edison Company – Incorporated in Ohio,Ohio,
+20947-0001031296-06-000044,Ohio Edison Company,Ohio,
 20947-0001031296-06-000044,The Cleveland Electric Illuminating Company,Ohio,
 20947-0001031296-06-000044,The Toledo Edison Company,Ohio,
 20947-0001031296-06-000044,Centerior Service Company,Ohio,
@@ -925,94 +861,54 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage
 20947-0001031296-06-000044,FirstEnergy Nuclear Generation Corp.,Ohio,
 34067-0001104659-06-016592,Nobelclad Europe S.A.,"Rivesaltes, France",
 34067-0001104659-06-016592,Nitro Metall Aktiebolag,"Likenas, Sweden",
-38725-0000038725-17-000042,Bombas Leao SA,Brazil,100
-38725-0000038725-17-000042,Cookson & Zinn (PTL) Limited ,United Kingdom,100
-38725-0000038725-17-000042,Coverco S.r.l.,Italy,100
-38725-0000038725-17-000042,FE Latin America B.V.,Netherlands,100
-38725-0000038725-17-000042,FELE C.V.,Netherlands,100
-38725-0000038725-17-000042,"Franklin Control Systems, Inc.",Oregon,100
-38725-0000038725-17-000042,Franklin Electric (Australia) Pty. Ltd.,Australia,100
-38725-0000038725-17-000042,Franklin Electric (Chile) Ltda,Chile,100
-38725-0000038725-17-000042,Franklin Electric (SEA) Pty. Ltd.,Singapore,100
-38725-0000038725-17-000042,Franklin Electric (South Africa) Pty. Ltd.,South Africa,100
-38725-0000038725-17-000042,"Franklin Electric (Suzhou) Co., Ltd.",China,100
-38725-0000038725-17-000042,Franklin Electric (Zambia) Ltd.,Zambia,100
-38725-0000038725-17-000042,Franklin Electric B.V.,Netherlands,100
-38725-0000038725-17-000042,Franklin Electric Botswana Pty. Ltd.,Botswana,100
-38725-0000038725-17-000042,"Franklin Electric Canada, Inc.",Canada,100
-38725-0000038725-17-000042,Franklin Electric Colombia SAS,Colombia,100
-38725-0000038725-17-000042,Franklin Electric Europa GmbH,Germany,100
-38725-0000038725-17-000042,Franklin Electric Germany Holding GmbH,Germany,100
-38725-0000038725-17-000042,Franklin Electric Holding B.V.,Netherlands,100
-38725-0000038725-17-000042,Franklin Electric India Private Ltd.,Indiana,100
-38725-0000038725-17-000042,Franklin Electric Industria de Motobombas SA,Brazil,100
-38725-0000038725-17-000042,"Franklin Electric International, Inc.",Delaware,100
-38725-0000038725-17-000042,Franklin Electric NL BV,Netherlands,100
-38725-0000038725-17-000042,Franklin Electric spol s.r.o.,Czech Republic,100
-38725-0000038725-17-000042,"Franklin Electric Subsidiaries, LLC",Indiana,100
-38725-0000038725-17-000042,"Franklin Electric Trading (Shanghai) Co., Ltd.",China,100
-38725-0000038725-17-000042,Franklin Fueling Sistemas de Combustiveis Ltda,Brazil,100
-38725-0000038725-17-000042,Franklin Fueling Systems (Beijing) Company Ltd.,China,100
-38725-0000038725-17-000042,Franklin Fueling Systems Australia Pty. Ltd.,Australia,100
-38725-0000038725-17-000042,Franklin Fueling Systems France SARL,France,100
-38725-0000038725-17-000042,Franklin Fueling Systems India Private Ltd.,India,100
-38725-0000038725-17-000042,Franklin Fueling Systems Ltd.,United Kingdom,100
-38725-0000038725-17-000042,Franklin Fueling Systems GmbH,Germany,100
-38725-0000038725-17-000042,"Franklin Fueling Systems, Inc.",Indiana,100
-38725-0000038725-17-000042,Impo Motor Pompa Sanayi ve Ticaret A.S.,Turkey,90
-38725-0000038725-17-000042,"Intelligent Controls, LLC",Maine,100
-38725-0000038725-17-000042,Motores Electricos Sumergibles de Mexico S. de R.L de C.V.,Mexico,100
-38725-0000038725-17-000042,Motores Franklin S.A. de C.V.,Mexico,100
-38725-0000038725-17-000042,Motori Sommersi Riavvolgibili S.r.l.,Italy,75
-38725-0000038725-17-000042,Pioneer Pump Holdings Pty.,Australia,100
-38725-0000038725-17-000042,Pioneer Pump Ltd.,United Kingdom,100
-38725-0000038725-17-000042,Pioneer Pump Pty. Ltd. ,South Africa,100
-38725-0000038725-17-000042,Pioneer Pump Solutions Ltd.,United Kingdom,100
-38725-0000038725-17-000042,"Pioneer Pump, Inc.",Texas,100
-38725-0000038725-17-000042,Pluga Pumps and Motors Private Limited,India,70
-38725-0000038725-17-000042,Servicios de MESMEX S de SRL de CV,Mexico,100
-38725-0000038725-17-000042,Franklin Electric S.r.l,Italy,100
-38725-0000038725-17-000042,Franklin Wadcorpp India Private Limited,India,65
-60549-0001047469-98-012481,Louisville Gas and Electric Company,Kentucky,
-60549-0001047469-98-012481,LG&E Capital Corp.,Kentucky,
-60549-0001047469-98-012481,LG&E Power Inc.,Delaware,
-60549-0001047469-98-012481,LG&E Power Operations Inc.,California,
-60549-0001047469-98-012481,LG&E Energy Marketing Inc.,Oklahoma,
-60549-0001047469-98-012481,LG&E International Inc.,Delaware,
-60549-0001047469-98-012481,Louisville Gas and Electric Company,,
-60549-0001047469-98-012481,LG&E Capital Corp.,,
-60549-0001047469-98-012481,LG&E Power Inc.,,
-60549-0001047469-98-012481,LG&E International Inc.,,
-60549-0001047469-98-012481,LG&E Energy Marketing Inc.,,
-60549-0001047469-98-012481,LG&E Power Operations ,,
-60549-0001047469-98-012481,LG&E Energy Marketing Inc.,,
-61986-0000061986-99-000003,"Femco Machine Co., Inc.",Nevada,
-61986-0000061986-99-000003,Kolpak Manufacturing Company,Tennessee,
-61986-0000061986-99-000003,"Manitex, Inc.",Texas,
-61986-0000061986-99-000003,"Manitowoc MEC, Inc.",Nevada,
-61986-0000061986-99-000003,"Manitowoc Equipment Works PTE, Ltd.",Singapore,
-61986-0000061986-99-000003,"Manitowoc Equipment Works, Inc.",Nevada,
-61986-0000061986-99-000003,"Manitowoc Europe Holdings, Ltd.",England,
-61986-0000061986-99-000003,Manitowoc Europe Limited,England,
-61986-0000061986-99-000003,Manitowoc International Sales Corp. ,Barbados,
-61986-0000061986-99-000003,"Manitowoc Korea Company, Ltd.",Korea,
-61986-0000061986-99-000003,"Manitowoc Marine Group, Inc",Nevada,
-61986-0000061986-99-000003,"Manitowoc Re-Manufacturing, Inc.",Wisconsin,
-61986-0000061986-99-000003,"Manitowoc Western Company, Inc.",Wisconsin,
-61986-0000061986-99-000003,North Central Crane & Excavator Sales Corp.,Nevada,
-61986-0000061986-99-000003,"West Manitowoc, Inc.",Wisconsin,
-61986-0000061986-99-000003,"Manitowoc CP, Inc. ",Nevada,
-61986-0000061986-99-000003,"Manitowoc FP, Inc.",Nevada,
-61986-0000061986-99-000003,"KMT Refrigeration, Inc.",Wisconsin,
-61986-0000061986-99-000003,"Manitowoc Foodservice Group, Inc.",Nevada,
-61986-0000061986-99-000003,"Manitowoc Crane Group, Inc.",Nevada,
-61986-0000061986-99-000003,"Manitowoc Ice, Inc.",Wisconsin,
-61986-0000061986-99-000003,"Manitowoc Cranes, Inc.",Wisconsin,
-61986-0000061986-99-000003,"SerVend International, Inc.",Nevada,
-61986-0000061986-99-000003,"Manitowoc Beverage Systems, Inc. ",Nevada,
-61986-0000061986-99-000003,KMT Sales Corporation,Nevada,
-61986-0000061986-99-000003,SerVend Sales Corporation,Nevada,
-61986-0000061986-99-000003,"USTC, Inc.",Nevada,
+38725-0000038725-17-000042,Bombas Leao SA,Brazil,100.0
+38725-0000038725-17-000042,Cookson & Zinn (PTL) Limited ,United Kingdom,100.0
+38725-0000038725-17-000042,Coverco S.r.l.,Italy,100.0
+38725-0000038725-17-000042,FE Latin America B.V.,Netherlands,100.0
+38725-0000038725-17-000042,FELE C.V.,Netherlands,100.0
+38725-0000038725-17-000042,"Franklin Control Systems, Inc.",Oregon,100.0
+38725-0000038725-17-000042,Franklin Electric (Australia) Pty. Ltd.,Australia,100.0
+38725-0000038725-17-000042,Franklin Electric (Chile) Ltda,Chile,100.0
+38725-0000038725-17-000042,Franklin Electric (SEA) Pty. Ltd.,Singapore,100.0
+38725-0000038725-17-000042,Franklin Electric (South Africa) Pty. Ltd.,South Africa,100.0
+38725-0000038725-17-000042,"Franklin Electric (Suzhou) Co., Ltd.",China,100.0
+38725-0000038725-17-000042,Franklin Electric (Zambia) Ltd.,Zambia,100.0
+38725-0000038725-17-000042,Franklin Electric B.V.,Netherlands,100.0
+38725-0000038725-17-000042,Franklin Electric Botswana Pty. Ltd.,Botswana,100.0
+38725-0000038725-17-000042,"Franklin Electric Canada, Inc.",Canada,100.0
+38725-0000038725-17-000042,Franklin Electric Colombia SAS,Colombia,100.0
+38725-0000038725-17-000042,Franklin Electric Europa GmbH,Germany,100.0
+38725-0000038725-17-000042,Franklin Electric Germany Holding GmbH,Germany,100.0
+38725-0000038725-17-000042,Franklin Electric Holding B.V.,Netherlands,100.0
+38725-0000038725-17-000042,Franklin Electric India Private Ltd.,India,100.0
+38725-0000038725-17-000042,Franklin Electric Industria de Motobombas SA,Brazil,100.0
+38725-0000038725-17-000042,"Franklin Electric International, Inc.",Delaware,100.0
+38725-0000038725-17-000042,Franklin Electric NL BV,Netherlands,100.0
+38725-0000038725-17-000042,Franklin Electric spol s.r.o.,Czech Republic,100.0
+38725-0000038725-17-000042,"Franklin Electric Subsidiaries, LLC",Indiana,100.0
+38725-0000038725-17-000042,"Franklin Electric Trading (Shanghai) Co., Ltd.",China,100.0
+38725-0000038725-17-000042,Franklin Fueling Sistemas de Combustiveis Ltda,Brazil,100.0
+38725-0000038725-17-000042,Franklin Fueling Systems (Beijing) Company Ltd.,China,100.0
+38725-0000038725-17-000042,Franklin Fueling Systems Australia Pty. Ltd.,Australia,100.0
+38725-0000038725-17-000042,Franklin Fueling Systems France SARL,France,100.0
+38725-0000038725-17-000042,Franklin Fueling Systems India Private Ltd.,India,100.0
+38725-0000038725-17-000042,Franklin Fueling Systems Ltd.,United Kingdom,100.0
+38725-0000038725-17-000042,Franklin Fueling Systems GmbH,Germany,100.0
+38725-0000038725-17-000042,"Franklin Fueling Systems, Inc.",Indiana,100.0
+38725-0000038725-17-000042,Impo Motor Pompa Sanayi ve Ticaret A.S.,Turkey,90.0
+38725-0000038725-17-000042,"Intelligent Controls, LLC",Maine,100.0
+38725-0000038725-17-000042,Motores Electricos Sumergibles de Mexico S. de R.L de C.V.,Mexico,100.0
+38725-0000038725-17-000042,Motores Franklin S.A. de C.V.,Mexico,100.0
+38725-0000038725-17-000042,Motori Sommersi Riavvolgibili S.r.l.,Italy,75.0
+38725-0000038725-17-000042,Pioneer Pump Holdings Pty.,Australia,100.0
+38725-0000038725-17-000042,Pioneer Pump Ltd.,United Kingdom,100.0
+38725-0000038725-17-000042,Pioneer Pump Pty. Ltd. ,South Africa,100.0
+38725-0000038725-17-000042,Pioneer Pump Solutions Ltd.,United Kingdom,100.0
+38725-0000038725-17-000042,"Pioneer Pump, Inc.",Texas,100.0
+38725-0000038725-17-000042,Pluga Pumps and Motors Private Limited,India,70.0
+38725-0000038725-17-000042,Servicios de MESMEX S de SRL de CV,Mexico,100.0
+38725-0000038725-17-000042,Franklin Electric S.r.l,Italy,100.0
+38725-0000038725-17-000042,Franklin Wadcorpp India Private Limited,India,65.0
 71675-0001046861-02-000012,Central Maine Power Company,Maine,
 71675-0001046861-02-000012,Connecticut Natural Gas Corporation,Connecticut,
 71675-0001046861-02-000012,"Energy East Enterprises, Inc.",Maine,
@@ -1045,25 +941,18 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage
 77227-0001031296-09-000008,"GPU Power, Inc.",Delaware,
 77227-0001031296-09-000008,FirstEnergy Foundation,Ohio,
 77227-0001031296-09-000008,FirstEnergy Fiber Holdings Corp.,Delaware,
-78778-0000078778-97-000019,Piper Jaffray Inc.,Delaware,100
-78778-0000078778-97-000019,Piper Jaffray International Inc.,Delaware,100
-78778-0000078778-97-000019,Piper Capital Management Incorporated,Delaware,100
-78778-0000078778-97-000019,Piper Trust Company,Minnesota,100
-78778-0000078778-97-000019,Premier Acceptance Corporation,Delaware,100
-78778-0000078778-97-000019,Piper Realty Management Incorporated,Delaware,100
-78778-0000078778-97-000019,"Piper Jaffray Ventures, Inc. ",Delaware,100
 78890-0000078890-14-000004,The Pittston Company,Delaware,
 78890-0000078890-14-000004,"Glen Allen Development, Inc.",Delaware,
-78890-0000078890-14-000004,"Liberty National Development Company, LLC (32.5%)",Delaware,
-78890-0000078890-14-000004,"New Liberty Residential Urban Renewal Company, LLC (17.5%)",New Jersey,
+78890-0000078890-14-000004,"Liberty National Development Company, LLC",Delaware,32.5
+78890-0000078890-14-000004,"New Liberty Residential Urban Renewal Company, LLC",New Jersey,17.5
 78890-0000078890-14-000004,Pittston Services Group Inc.,Virginia,
 78890-0000078890-14-000004,Brink’s Holding Company,Delaware,
 78890-0000078890-14-000004,"Brink’s, Incorporated (“BI”)",Delaware,
 78890-0000078890-14-000004,"Brink’s Delaware, LLC",Delaware,
 78890-0000078890-14-000004,Brink’s Express Company,Illinois,
 78890-0000078890-14-000004,"Brink’s Global Payments, LLC",Delaware,
-78890-0000078890-14-000004,Brink’s St. Lucia Ltd.,St. Lucia,26
-78890-0000078890-14-000004,Security Services (Brink’s Jordan) Company Ltd,Jordan,95
+78890-0000078890-14-000004,Brink’s St. Lucia Ltd.,St. Lucia,26.0
+78890-0000078890-14-000004,Security Services (Brink’s Jordan) Company Ltd,Jordan,95.0
 78890-0000078890-14-000004,"Servicio Pan Americano de Protección S.A. de C.V. (“Serpaprosa”) (by Trust, BI is Settlor of Trust)",Mexico,99.75
 78890-0000078890-14-000004,"Aeroflash Mensajeria, S.A. de C.V.",Mexico,99.75
 78890-0000078890-14-000004,"Inmobiliaria, A.J., S.A. de C.V.",Mexico,99.75
@@ -1111,15 +1000,15 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage
 78890-0000078890-14-000004,Inversiones Petra S.A.,Chile,
 78890-0000078890-14-000004,"Brink’s Chile, S.A. (BICV is beneficial owner)",Chile,
 78890-0000078890-14-000004,Organismo Tecnico de Capacitacion Brink’s SpA,Chile,
-78890-0000078890-14-000004,Brink’s de Colombia S.A.,Colombia,58
-78890-0000078890-14-000004,Domesa de Colombia S.A.,Colombia,70
-78890-0000078890-14-000004,Procesos & Canje S.A.,Colombia,58
+78890-0000078890-14-000004,Brink’s de Colombia S.A.,Colombia,58.0
+78890-0000078890-14-000004,Domesa de Colombia S.A.,Colombia,70.0
+78890-0000078890-14-000004,Procesos & Canje S.A.,Colombia,58.0
 78890-0000078890-14-000004,Sistema Integrado Multiple de Pago Electronicos S.A. (“SIMPLE S.A.”),Colombia,14.5
 78890-0000078890-14-000004,"Brink’s Canada Holdings, B.V. (BICV is beneficial owner)",Netherlands,
 78890-0000078890-14-000004,Brink’s Canada Limited,Canada,
 78890-0000078890-14-000004,"Brink’s Security Services, B.V.",Netherlands,
 78890-0000078890-14-000004,"Centro Americana de Inversiones Balboa, C.A. (BICV is beneficial owner)",Panama,
-78890-0000078890-14-000004,Hermes Transporte Blindados S.A.,Peru,36
+78890-0000078890-14-000004,Hermes Transporte Blindados S.A.,Peru,36.0
 78890-0000078890-14-000004,"Brink’s Dutch Holdings, B.V. (BICV is beneficial owner)",Netherlands,
 78890-0000078890-14-000004,"Brink’s Hellenic Holdings, B.V. (“BHH”)",Netherlands,
 78890-0000078890-14-000004,"Athena Marathon Holdings, B.V. (“AMH”)",Netherlands,
@@ -1131,33 +1020,33 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage
 78890-0000078890-14-000004,Brink’s Hermes Cash & Valuable Services S.A. (“Brink’s Cash & Valuable Services SA”),Greece,
 78890-0000078890-14-000004,Brink's Hellas Guarding & Cash Services Joint VentureAnonymi Etairia,Greece,
 78890-0000078890-14-000004,Brink’s Hermes Security Services SA (“Brink’s Security Services S.A.”),Greece,
-78890-0000078890-14-000004,Brink’s Hermes Aviation Security Services S.A.,Greece,70
-78890-0000078890-14-000004,Hellenic Central Station SA - Reception & Processing Centre of Electronic Signals (“Hellenic Central Station”),Greece,10
+78890-0000078890-14-000004,Brink’s Hermes Aviation Security Services S.A.,Greece,70.0
+78890-0000078890-14-000004,Hellenic Central Station SA - Reception & Processing Centre of Electronic Signals (“Hellenic Central Station”),Greece,10.0
 78890-0000078890-14-000004,"BHM Human Resources Mexico Holding, S.A. de C.V.",Mexico,
 78890-0000078890-14-000004,"Servicios Administrativos Consolidados BM de Mexico, S.A. de C.V.",Mexico,
 78890-0000078890-14-000004,"BM Control y Administracion de Personal, S.A. de C.V.",Mexico,
 78890-0000078890-14-000004,BHM Human Resources Solutions B.V.,Netherlands,
 78890-0000078890-14-000004,Brink’s Argentina S.A.,Argentina,
-78890-0000078890-14-000004,Brink’s Seguridad Corporativa S.A.,Argentina,98
+78890-0000078890-14-000004,Brink’s Seguridad Corporativa S.A.,Argentina,98.0
 78890-0000078890-14-000004,Brink’s India Private Limited,India,
-78890-0000078890-14-000004,Brinks Mongolia LLC,Mongolia,51
-78890-0000078890-14-000004,Brink’s RUS Holding B.V.,Netherlands,70
-78890-0000078890-14-000004,Limited Liability Company Brink’s Management,Russian Federation,70
-78890-0000078890-14-000004,Limited Liability Company Brink’s Management,Russian Federation,70
-78890-0000078890-14-000004,Non Banking Credit Organization BRINKS,Russian Federation,70
-78890-0000078890-14-000004,Servicio Pan Americano de Proteccion C.A.,Venezuela,61
-78890-0000078890-14-000004,"Aeropanamericano, C.A.",Venezuela,61
-78890-0000078890-14-000004,"Aero Sky Panama, S.A.",Panama,61
-78890-0000078890-14-000004,"Artes Graficas Avanzadas 98, C.A.",Venezuela,61
-78890-0000078890-14-000004,"Blindados de Zulia Occidente, C.A.",Venezuela,61
-78890-0000078890-14-000004,"Blindados de Oriente, S.A.",Venezuela,61
-78890-0000078890-14-000004,"Blindados Panamericanos, S.A.",Venezuela,61
-78890-0000078890-14-000004,"Blindados Centro Occidente, S.A.",Venezuela,61
-78890-0000078890-14-000004,"Documentos Mercantiles, S.A.",Venezuela,61
-78890-0000078890-14-000004,"Instituto Panamericano, C.A.",Venezuela,61
-78890-0000078890-14-000004,"Intergraficas Panama, S.A.",Panama,61
-78890-0000078890-14-000004,"Panamericana de Vigilancia, S.A. ",Venezuela,61
-78890-0000078890-14-000004,"Transportes Expresos, C.A. ",Venezuela,61
+78890-0000078890-14-000004,Brinks Mongolia LLC,Mongolia,51.0
+78890-0000078890-14-000004,Brink’s RUS Holding B.V.,Netherlands,70.0
+78890-0000078890-14-000004,Limited Liability Company Brink’s Management,Russian Federation,70.0
+78890-0000078890-14-000004,Limited Liability Company Brink’s Management,Russian Federation,70.0
+78890-0000078890-14-000004,Non Banking Credit Organization BRINKS,Russian Federation,70.0
+78890-0000078890-14-000004,Servicio Pan Americano de Proteccion C.A.,Venezuela,61.0
+78890-0000078890-14-000004,"Aeropanamericano, C.A.",Venezuela,61.0
+78890-0000078890-14-000004,"Aero Sky Panama, S.A.",Panama,61.0
+78890-0000078890-14-000004,"Artes Graficas Avanzadas 98, C.A.",Venezuela,61.0
+78890-0000078890-14-000004,"Blindados de Zulia Occidente, C.A.",Venezuela,61.0
+78890-0000078890-14-000004,"Blindados de Oriente, S.A.",Venezuela,61.0
+78890-0000078890-14-000004,"Blindados Panamericanos, S.A.",Venezuela,61.0
+78890-0000078890-14-000004,"Blindados Centro Occidente, S.A.",Venezuela,61.0
+78890-0000078890-14-000004,"Documentos Mercantiles, S.A.",Venezuela,61.0
+78890-0000078890-14-000004,"Instituto Panamericano, C.A.",Venezuela,61.0
+78890-0000078890-14-000004,"Intergraficas Panama, S.A.",Panama,61.0
+78890-0000078890-14-000004,"Panamericana de Vigilancia, S.A. ",Venezuela,61.0
+78890-0000078890-14-000004,"Transportes Expresos, C.A. ",Venezuela,61.0
 78890-0000078890-14-000004,Brink’s Panama S.A.,Panama,
 78890-0000078890-14-000004,Inmobiliaria Brink’s Panama S.A.,Panama,
 78890-0000078890-14-000004,Brink’s Global Services Poland Sp.zo.o.,Poland,
@@ -1179,9 +1068,9 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage
 78890-0000078890-14-000004,Brink’s Évolution S.A.R.L.,France,
 78890-0000078890-14-000004,Est Valeurs SAS,France,
 78890-0000078890-14-000004,Brink’s Formation S.A.R.L.,France,
-78890-0000078890-14-000004,Brink’s Madagascar S.A.,Madagascar,60
+78890-0000078890-14-000004,Brink’s Madagascar S.A.,Madagascar,60.0
 78890-0000078890-14-000004,Brink’s Maroc S.A.S.,Morocco,
-78890-0000078890-14-000004,Brink’s Qatar L.L.C.,Qatar,49
+78890-0000078890-14-000004,Brink’s Qatar L.L.C.,Qatar,49.0
 78890-0000078890-14-000004,Brink’s Réunion S.A.R.L.,St. Denis,
 78890-0000078890-14-000004,Brink’s Security Services SAS,France,
 78890-0000078890-14-000004,Brink’s Teleservices SAS,France,
@@ -1207,7 +1096,7 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage
 78890-0000078890-14-000004,Brink’s Diamond (Shanghai) Company Limited,China,
 78890-0000078890-14-000004,Brink’s Jewellery Trading (Shanghai) Company Limited,China,
 78890-0000078890-14-000004,Brink’s Security Transportation (Shanghai) Company Limited,China,
-78890-0000078890-14-000004,Brink’s Global Services Korea Limited – Yunan Hoesa Brink’s Global,Korea,80
+78890-0000078890-14-000004,Brink’s Global Services Korea Limited – Yunan Hoesa Brink’s Global,Korea,80.0
 78890-0000078890-14-000004,Brink’s Nederland B.V.,Netherlands,
 78890-0000078890-14-000004,Brink’s Geldverwerking B.V.,Netherlands,
 78890-0000078890-14-000004,Brink’s Regional Services B.V.,Netherlands,
@@ -1218,12 +1107,12 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage
 78890-0000078890-14-000004,Redetrel – Rede Transacoes Eletronicas Ltda.,Brazil,
 78890-0000078890-14-000004,ePago International Inc.,Panama,
 78890-0000078890-14-000004,"Corporación ePago de Venezuela, C.A.",Venezuela,
-78890-0000078890-14-000004,e-Pago de Colombia S.A. ,Colombia,75
+78890-0000078890-14-000004,e-Pago de Colombia S.A. ,Colombia,75.0
 78890-0000078890-14-000004,Brink’s ePago S.A. de C.V.,Mexico,
 78890-0000078890-14-000004,Brink’s Global Services (BGS) Botswana (Proprietary) Limited,Botswana,
 78890-0000078890-14-000004,Brink’s Macau Limited,Macao,
 78890-0000078890-14-000004,Brink’s Taiwan Security Limited,Taiwan,
-78890-0000078890-14-000004,Brink’s (Thailand) Limited,Thailand,40
+78890-0000078890-14-000004,Brink’s (Thailand) Limited,Thailand,40.0
 78890-0000078890-14-000004,Brink’s Global Technology Limited,Thailand,
 78890-0000078890-14-000004,Brink’s Guvenlik Hizmetleri Anonim Sirketi,Turkey,
 78890-0000078890-14-000004,Brink’s (UK) Limited,U.K.,
@@ -1269,7 +1158,6 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage
 78890-0000078890-14-000004,PMV Gold Company,Delaware,
 78890-0000078890-14-000004,Pittston Mineral Ventures International Ltd.,Delaware,
 78890-0000078890-14-000004,Mineral Ventures of Australia Pty Ltd,Australia,
-80812-0000927016-98-004349,"ProvEnergy Investments, Ltd.",Rhode Island,
 86521-0000086521-10-000019,Enova Corporation,California,
 86521-0000086521-10-000019,Pacific Enterprises,California,
 86521-0000086521-10-000019,Pacific Enterprises International,California,
@@ -1341,26 +1229,6 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage
 96271-0001193125-07-042781,"TPS International Power, Inc.",Cayman Islands,
 96271-0001193125-07-042781,"TPS de Ultramar, LTD",Cayman Islands,
 96271-0001193125-07-042781,"TPS de Ultramar Guatemala, S.A.",Guatemalan,
-99250-0000099250-00-000002,Cardinal Operating Company,Delaware,100
-99250-0000099250-00-000002,Cross Bay Operating Company,Delaware,100
-99250-0000099250-00-000002,Cumberland Operating Company,Delaware,100
-99250-0000099250-00-000002,Independence Operating Company,Delaware,100
-99250-0000099250-00-000002,"Marsh Resources, Inc.",Delaware,100
-99250-0000099250-00-000002,Pine Needle Operating Company,Delaware,100
-99250-0000099250-00-000002,"TGPL Enterprises, Inc.",Delaware,100
-99250-0000099250-00-000002,Transco Cross Bay Company,Delaware,100
-99250-0000099250-00-000002,TransCardinal Company,Delaware,100
-99250-0000099250-00-000002,TransCarolina LNG Company,Delaware,100
-99250-0000099250-00-000002,TransCumberland Pipeline Company,Delaware,100
-99250-0000099250-00-000002,Transco Independence Pipeline Company ,Delaware,100
-99250-0000099250-00-000002,"Delaware WGP Enterprises, Inc",Delaware,100
-99250-0000099250-00-000002,"Williams Gas Processing - Gulf Coast Company, L.P.",Delaware,99
-100122-0000941138-03-000007,Tucson Electric Power Company (TEP),Arizona,
-100122-0000941138-03-000007,San Carlos Resources Inc.,Arizona,
-100122-0000941138-03-000007,"Millennium Energy Holdings, Inc. (Millennium)",Arizona,
-100122-0000941138-03-000007,"Advanced Energy Technologies, Inc.",Arizona,
-100122-0000941138-03-000007,"Global Solar Energy, Inc.",Arizona,
-100122-0000941138-03-000007,UniSource Energy Development (UED),Arizona,
 103872-0001193125-13-444053,14011 So. Normandie Ave. Realty Corp.,Nevada,
 103872-0001193125-13-444053,500 South Douglas Realty Corp.,Delaware,
 103872-0001193125-13-444053,Arctern Consulting Private Limited (2),India,
@@ -1462,7 +1330,1077 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage
 103872-0001193125-13-444053,"Volt Telecommunications Group, Inc.",Delaware,
 103872-0001193125-13-444053,"Volt Temporary Services, Inc.",Delaware,
 103872-0001193125-13-444053,"Volt Workforce Solutions, Inc.",Delaware,
-320575-0001193125-07-117419,PDC MGMT. CO. (formerly SOTEX Exploration Company),Texas,100
-320575-0001193125-07-117419,"PDC Investment Corp.,",Delaware,100
-320575-0001193125-07-117419,"Pioneer Drilling Services, Ltd. (formerly Pioneer Drilling Co., Ltd.)",Texas,100
-320575-0001193125-07-117419,South Texas Drilling Company,Texas,100
+320575-0001193125-07-117419,PDC MGMT. CO. (formerly SOTEX Exploration Company),Texas,100.0
+320575-0001193125-07-117419,"PDC Investment Corp.,",Delaware,100.0
+320575-0001193125-07-117419,"Pioneer Drilling Services, Ltd. (formerly Pioneer Drilling Co., Ltd.)",Texas,100.0
+320575-0001193125-07-117419,South Texas Drilling Company,Texas,100.0
+3499-0000003499-08-000003,731 Commercial Holding LLC,,
+3499-0000003499-08-000003,731 Commercial LLC,,
+3499-0000003499-08-000003,731 Office One Holding LLC,,
+3499-0000003499-08-000003,731 Office One LLC,,
+3499-0000003499-08-000003,731 Office Two Holding LLC,,
+3499-0000003499-08-000003,731 Office Two LLC,,
+3499-0000003499-08-000003,731 Residential Holding LLC,,
+3499-0000003499-08-000003,731 Residential LLC,,
+3499-0000003499-08-000003,731 Restaurant LLC,,
+3499-0000003499-08-000003,731 Retail One LLC,,
+3499-0000003499-08-000003,"Alexander’s Department Stores of Brooklyn, Inc.",,
+3499-0000003499-08-000003,"Alexander’s Department Stores of New Jersey, Inc.",,
+3499-0000003499-08-000003,"Alexander’s Kings Plaza, LLC",,
+3499-0000003499-08-000003,"Alexander’s of Kings, LLC",,
+3499-0000003499-08-000003,Alexander’s Management LLC,,
+3499-0000003499-08-000003,Alexander’s of Brooklyn II LLC,,
+3499-0000003499-08-000003,"Alexander’s of Brooklyn, Inc.",,
+3499-0000003499-08-000003,"Alexander’s of Flushing, Inc.",,
+3499-0000003499-08-000003,"Alexander’s of Rego Park II, Inc.",,
+3499-0000003499-08-000003,"Alexander’s of Rego Park III, Inc.",,
+3499-0000003499-08-000003,"ALX of Paramus, LLC",,
+3499-0000003499-08-000003,"Alexander’s Personnel Providers, Inc.",,
+3499-0000003499-08-000003,"Alexander’s Rego Park Center, Inc.",,
+3499-0000003499-08-000003,"Alexander’s Rego Shopping Center, Inc.",,
+3499-0000003499-08-000003,Alexander’s Restaurant LLC,,
+3499-0000003499-08-000003,"Kings Parking, LLC",,
+3499-0000003499-08-000003,Kings Plaza Lender LLC,,
+3499-0000003499-08-000003,Ownreal Inc.,,
+3499-0000003499-08-000003,Rego Park Comercial LLC,,
+3499-0000003499-08-000003,Rego Park Residential LLC,,
+3499-0000003499-08-000003,"Sakraf Wine & Liquor Store, Inc.",,
+3570-0000003570-17-000052,Caldera LNG Holdings SpA,Chile,
+3570-0000003570-17-000052,Cheniere Chile SpA,Chile,
+3570-0000003570-17-000052,"Cheniere CCH HoldCo I, LLC",Delaware,
+3570-0000003570-17-000052,"Cheniere CCH HoldCo II, LLC",Delaware,
+3570-0000003570-17-000052,"Cheniere Corpus Christi Holdings, LLC",Delaware,
+3570-0000003570-17-000052,"Cheniere Corpus Christi Pipeline, L.P.",Delaware,
+3570-0000003570-17-000052,"Cheniere Corpus Christi Pipeline Stage III, LLC",Delaware,
+3570-0000003570-17-000052,"Cheniere Creole Trail Pipeline, L.P.",Delaware,
+3570-0000003570-17-000052,"Cheniere Energy Investments, LLC",Delaware,
+3570-0000003570-17-000052,"Cheniere Energy Operating Co., Inc",Delaware,
+3570-0000003570-17-000052,"Cheniere Energy Partners GP, LLC",Delaware,
+3570-0000003570-17-000052,"Cheniere Energy Partners LP Holdings, LLC",Delaware,
+3570-0000003570-17-000052,"Cheniere Energy Partners, L.P.",Delaware,
+3570-0000003570-17-000052,"Cheniere Energy Shared Services Holdings, LLC",Delaware,
+3570-0000003570-17-000052,"Cheniere Energy Shared Services, Inc.",Delaware,
+3570-0000003570-17-000052,"Cheniere Field Services, LLC",Delaware,
+3570-0000003570-17-000052,"Cheniere GP Holding Company, LLC",Delaware,
+3570-0000003570-17-000052,"Cheniere Ingleside Marine Terminal, LLC",Delaware,
+3570-0000003570-17-000052,"Cheniere International Investments Holdings, S.à.r.l",Luxembourg,
+3570-0000003570-17-000052,"Cheniere International Investments, S.à.r.l",Luxembourg,
+3570-0000003570-17-000052,"Cheniere Land Holdings, LLC",Delaware,
+3570-0000003570-17-000052,"Cheniere Liquids, LLC",Delaware,
+3570-0000003570-17-000052,"Cheniere LNG Holdings GP, LLC",Delaware,
+3570-0000003570-17-000052,"Cheniere LNG O&M Services, LLC",Delaware,
+3570-0000003570-17-000052,"Cheniere LNG Terminals, LLC",Delaware,
+3570-0000003570-17-000052,"Cheniere Marketing International HoldCo I, L.P.",Bermuda,
+3570-0000003570-17-000052,"Cheniere Marketing International HoldCo II, Ltd.",Bermuda,
+3570-0000003570-17-000052,"Cheniere Marketing International, LLP",United Kingdom,
+3570-0000003570-17-000052,"Cheniere Marketing, LLC",Delaware,
+3570-0000003570-17-000052,"Cheniere Marketing, Ltd.",United Kingdom,
+3570-0000003570-17-000052,Cheniere Marketing PTE Ltd.,Singapore,
+3570-0000003570-17-000052,"Cheniere Midship Holdings, LLC",Delaware,
+3570-0000003570-17-000052,"Cheniere Midstream Holdings, Inc.",Delaware,
+3570-0000003570-17-000052,"Cheniere Pipeline GP Interests, LLC",Delaware,
+3570-0000003570-17-000052,"Cheniere Pipeline Holdings, LLC",Delaware,
+3570-0000003570-17-000052,"Cheniere San Patricio Processing Hub, LLC",Delaware,
+3570-0000003570-17-000052,"Cheniere Southern Trail GP, Inc.",Delaware,
+3570-0000003570-17-000052,"Cheniere SPH Pipeline, LLC",Delaware,
+3570-0000003570-17-000052,"Cheniere Supply & Marketing, Inc.",Delaware,
+3570-0000003570-17-000052,Concepción LNG Holding SpA,Chile,
+3570-0000003570-17-000052,"Corpus Christi Liquefaction, LLC",Delaware,
+3570-0000003570-17-000052,"Corpus Christi Liquefaction Stage III, LLC",Delaware,
+3570-0000003570-17-000052,"Corpus Christi LNG, LLC",Delaware,
+3570-0000003570-17-000052,"Corpus Christi Pipeline GP, LLC",Delaware,
+3570-0000003570-17-000052,"Corpus Christi Tug Services, LLC",Delaware,
+3570-0000003570-17-000052,"CQH Holdings Company, LLC",Delaware,
+3570-0000003570-17-000052,"CUI I, LLC",Delaware,
+3570-0000003570-17-000052,"Johnson Bayou Holdings, LLC",Delaware,
+3570-0000003570-17-000052,"Live Oak LNG Holdings, LLC",Delaware,
+3570-0000003570-17-000052,"Louisiana LNG Holdings, LLC",Delaware,
+3570-0000003570-17-000052,"Nordheim Eagle Ford Gathering, LLC",Delaware,
+3570-0000003570-17-000052,"Sabine Pass Liquefaction, LLC",Delaware,
+3570-0000003570-17-000052,"Sabine Pass LNG-GP, LLC",Delaware,
+3570-0000003570-17-000052,"Sabine Pass LNG-LP, LLC",Delaware,
+3570-0000003570-17-000052,"Sabine Pass LNG, L.P.",Delaware,
+3570-0000003570-17-000052,"Sabine Pass Tug Services, LLC",Delaware,
+4127-0000004127-17-000033,"Skyworks Filter Solutions Japan Co., Ltd.",Japan,
+4127-0000004127-17-000033,Skyworks Global Pte. Ltd.,Singapore,
+4127-0000004127-17-000033,"Skyworks International Investments, LLC",Delaware,
+4127-0000004127-17-000033,Skyworks Ireland Limited,Ireland,
+4127-0000004127-17-000033,Skyworks Luxembourg S.a r.l,Luxembourg,
+4127-0000004127-17-000033,Skyworks Semiconductor,France,
+4127-0000004127-17-000033,Skyworks Solutions Canada Inc.,Canada,
+4127-0000004127-17-000033,"Skyworks Solutions Commercial Co., Ltd. (Shenzhen)",Peoples Republic of China,
+4127-0000004127-17-000033,"Skyworks Solutions Commercial Co., Ltd. (Shenzhen) - Beijing Branch",Beijing,
+4127-0000004127-17-000033,"Skyworks Solutions Commercial Co., Ltd. (Shenzhen) - Shanghai Branch",Shanghai,
+4127-0000004127-17-000033,"Skyworks Solutions Co, Limited",Japan,
+4127-0000004127-17-000033,"Skyworks Solutions de Mexico, S de R.L. de C.V.",Mexico,
+4127-0000004127-17-000033,Skyworks Solutions (Hong Kong) Limited,Hong Kong,
+4127-0000004127-17-000033,Skyworks Solutions Ireland Limited,Ireland,
+4127-0000004127-17-000033,Skyworks Solutions Korea Limited,Korea,
+4127-0000004127-17-000033,Skyworks Solutions Limited,United Kingdom,
+4127-0000004127-17-000033,Skyworks Solutions Oy,Finland,
+4127-0000004127-17-000033,"Skyworks Solutions Worldwide, Inc.",Delaware,
+4127-0000004127-17-000033,"Skyworks Solutions Worldwide, Inc., Taiwan Branch",Taiwan,
+4127-0000004127-17-000033,"Skyworks Solutions Worldwide, Inc., Malaysia Branch",Malaysia,
+4127-0000004127-17-000033,Advanced Analogic Technologies Incorporated,Delaware,
+4127-0000004127-17-000033,"Advanced Analogic Technologies (China), Inc.",Peoples Republic of China,
+4127-0000004127-17-000033,Axiom Microdevices Inc.,Delaware,
+4127-0000004127-17-000033,ICWave LLC,Massachusetts,
+4127-0000004127-17-000033,Isolink inc.,California,
+4127-0000004127-17-000033,MEMS Solutions Inc.,Korea,
+4127-0000004127-17-000033,Quantance Inc.,Delaware,
+4127-0000004127-17-000033,SiGe Semiconductor Inc.,Delaware,
+4127-0000004127-17-000033,SiGe Semiconductor (U.S.) Corp.,Delaware,
+4127-0000004127-17-000033,SiGe Semiconductor (Europe) Limited,United Kingdom,
+4127-0000004127-17-000033,"Trans-Tech, Inc.",Maryland,
+4962-0001193125-10-041232,American Express Company,(USA) New York,
+4962-0001193125-10-041232,56th Street AXP Campus LLC,(USA) Arizona,
+4962-0001193125-10-041232,American Express Austria Bank GmbH,Austria,
+4962-0001193125-10-041232,American Express Bank LLC,Russian Federation,
+4962-0001193125-10-041232,American Express Bank Ltd. S.A,Argentina,
+4962-0001193125-10-041232,American Express Banking Corp.,(USA) New York,
+4962-0001193125-10-041232,"American Express Travel Related Services Company, Inc.",(USA) New York,
+4962-0001193125-10-041232,American Express Bank (Mexico) S.A Institucion de Banca Multiple,Mexico,
+4962-0001193125-10-041232,"American Express Bank Services, S.A. de C.V.",Mexico,
+4962-0001193125-10-041232,American Express Bank FSB,United States,
+4962-0001193125-10-041232,American Express Receivables Financing Corporation IV LLC,(USA) Delaware,
+4962-0001193125-10-041232,American Express Business Loan Corporation,(USA) Utah,
+4962-0001193125-10-041232,American Express Centurion Bank,(USA) Utah,
+4962-0001193125-10-041232,American Express Receivables Financing Corporation III LLC,(USA) Delaware,
+4962-0001193125-10-041232,American Express Company (Mexico) S.A. de C.V.,Mexico,
+4962-0001193125-10-041232,"American Express Insurance Services, Agente de Seguros, S.A. de C.V.",Mexico,
+4962-0001193125-10-041232,"American Express Servicios Profesionales, S.A. de C.V.",Mexico,
+4962-0001193125-10-041232,American Express Credit Corporation,(USA) Delaware,
+4962-0001193125-10-041232,American Express Capital Australia,Australia,
+4962-0001193125-10-041232,"American Express Credit Mexico, LLC",(USA) Delaware,
+4962-0001193125-10-041232,Fideicomiso Empresarial American Express No.232033,Mexico,
+4962-0001193125-10-041232,American Express Euro Funding Limited Partnership,United Kingdom,
+4962-0001193125-10-041232,American Express Overseas Credit Corporation Limited,Jersey,
+4962-0001193125-10-041232,AEOCC Management Company Limited,Jersey,
+4962-0001193125-10-041232,American Express Overseas Credit Corporation N.V.,Netherlands Antilles,
+4962-0001193125-10-041232,AE Hungary Holdings Limited Liability Company,Hungary,
+4962-0001193125-10-041232,American Express Canada Credit Corporation,Canada,
+4962-0001193125-10-041232,American Express Canada Finance Limited,Canada,
+4962-0001193125-10-041232,American Express Sterling Funding Limited Partnership,United Kingdom,
+4962-0001193125-10-041232,American Express Funding (Luxembourg) S.a.r.l,Luxembourg,
+4962-0001193125-10-041232,Credco Receivables Corp.,(USA) Delaware,
+4962-0001193125-10-041232,"American Express Dutch Capital, LLC",(USA) Delaware,
+4962-0001193125-10-041232,American Express Europe Limited,(USA) Delaware,
+4962-0001193125-10-041232,Sceptre Nominees Limited,United Kingdom,
+4962-0001193125-10-041232,"American Express Global Financial Services, Inc.",(USA) Delaware,
+4962-0001193125-10-041232,American Express Holdings Netherlands CV,Netherlands,
+4962-0001193125-10-041232,"American Express Insurance Agency of Puerto Rico, Inc.",Puerto Rico,
+4962-0001193125-10-041232,"American Express International (NZ), Inc.",(USA) Delaware,
+4962-0001193125-10-041232,American Express Limited,(USA) Delaware,
+4962-0001193125-10-041232,Alpha Card SCRL,Belgium,
+4962-0001193125-10-041232,Alpha Card Merchant Services SCRL,Belgium,
+4962-0001193125-10-041232,BCC Corporate NV/SA,Belgium,
+4962-0001193125-10-041232,American Express (Malaysia) SDN. BHD.,Malaysia,
+4962-0001193125-10-041232,American Express (Thai) Co. Ltd,Thailand,
+4962-0001193125-10-041232,American Express Brasil Assessoria Empresarial Ltda.,Brazil,
+4962-0001193125-10-041232,American Express International (B) SDN.BHD,Brunei Darussalam,
+4962-0001193125-10-041232,"American Express International Holdings, LLC",(USA) Delaware,
+4962-0001193125-10-041232,American Express Argentina S.A.,Argentina,
+4962-0001193125-10-041232,American Express Holdings (France) SAS,France,
+4962-0001193125-10-041232,American Express France SAS,France,
+4962-0001193125-10-041232,American Express Carte France SA,France,
+4962-0001193125-10-041232,American Express Change SAS,France,
+4962-0001193125-10-041232,American Express Paris SAS,France,
+4962-0001193125-10-041232,American Express Services SA,France,
+4962-0001193125-10-041232,American Express Voyages SAS,France,
+4962-0001193125-10-041232,American Express Management,France,
+4962-0001193125-10-041232,American Express France Finance SNC,France,
+4962-0001193125-10-041232,South Pacific Credit Card Limited,New Zealand,
+4962-0001193125-10-041232,Centurion Finance Limited,New Zealand,
+4962-0001193125-10-041232,"American Express International, Inc",(USA) Delaware,
+4962-0001193125-10-041232,AE Exposure Management Limited,Jersey,
+4962-0001193125-10-041232,American Express (India) Private Limited,India,
+4962-0001193125-10-041232,American Express Asia Network Consulting (Beijing) Limited Company,China,
+4962-0001193125-10-041232,American Express Australia Limited,Australia,
+4962-0001193125-10-041232,American Express Company AS,Norway,
+4962-0001193125-10-041232,American Express Corporate Travel SA,Belgium,
+4962-0001193125-10-041232,American Express Denmark A/S,Denmark,
+4962-0001193125-10-041232,American Express Group Services Limited,United Kingdom,
+4962-0001193125-10-041232,American Express Holding AB,Sweden,
+4962-0001193125-10-041232,American Express Business Travel A/S,Denmark,
+4962-0001193125-10-041232,American Express Business Travel AB,Sweden,
+4962-0001193125-10-041232,American Express Business Travel AS,Norway,
+4962-0001193125-10-041232,Forsakringsaktiebolaget Viator,Sweden,
+4962-0001193125-10-041232,American Express Holdings Limited,United Kingdom,
+4962-0001193125-10-041232,American Express Insurance Services Europe Limited,United Kingdom,
+4962-0001193125-10-041232,American Express Services Europe Limited,United Kingdom,
+4962-0001193125-10-041232,American Express Hungary Financial Services Closed Company Limited by Shares,Hungary,
+4962-0001193125-10-041232,American Express Hungary Travel Services Ltd.,Hungary,
+4962-0001193125-10-041232,"American Express International (Taiwan), Inc.","Taiwan, Province of China",
+4962-0001193125-10-041232,American Express International SA,Greece,
+4962-0001193125-10-041232,Key Tours SA,Greece,
+4962-0001193125-10-041232,"American Express Japan Co., Ltd",Japan,
+4962-0001193125-10-041232,American Express Locazioni Finanziarie s.r.l,Italy,
+4962-0001193125-10-041232,American Express Payment Services Limited,United Kingdom,
+4962-0001193125-10-041232,American Express Poland S.A.,Poland,
+4962-0001193125-10-041232,American Express Reisebüro GmbH,Austria,
+4962-0001193125-10-041232,American Express Services India Limited,India,
+4962-0001193125-10-041232,American Express spol. s.r.o.,Czech Republic,
+4962-0001193125-10-041232,American Express Swiss Holdings GmbH,Switzerland,
+4962-0001193125-10-041232,Swisscard AECS AG,Switzerland,
+4962-0001193125-10-041232,American Express Travel (Singapore) Pte. Ltd.,Singapore,
+4962-0001193125-10-041232,American Express Travel Holdings (Hong Kong) Limited,Hong Kong,
+4962-0001193125-10-041232,CITS American Express Air Services Ltd,China,
+4962-0001193125-10-041232,CITS American Express Southern Air Services Ltd,China,
+4962-0001193125-10-041232,CITS American Express Travel Services Ltd,China,
+4962-0001193125-10-041232,Farrington American Express Travel Services Limited,Hong Kong,
+4962-0001193125-10-041232,American Express Travel Holdings (M) Company SDN. BHD.,Malaysia,
+4962-0001193125-10-041232,Mayflower American Express Travel Services SDN. BHD.,Malaysia,
+4962-0001193125-10-041232,"American Express Travel Services Vostok, LLC",Russian Federation,
+4962-0001193125-10-041232,ZAO “American Express International Services”,Russian Federation,
+4962-0001193125-10-041232,American Express Wholesale Currency Services Pty Limited,Australia,
+4962-0001193125-10-041232,Amex Broker Assicurativo s.r.l.,Italy,
+4962-0001193125-10-041232,"Amex General Insurance Agency, Inc.","Taiwan, Province of China",
+4962-0001193125-10-041232,"Amex Life Insurance Marketing, Inc.","Taiwan, Province of China",
+4962-0001193125-10-041232,Amex Travel Holding (Japan) Ltd.,Japan,
+4962-0001193125-10-041232,"American Express Nippon Travel Agency, Inc.",Japan,
+4962-0001193125-10-041232,Interactive Transaction Solutions Limited,United Kingdom,
+4962-0001193125-10-041232,Interactive Transactions Solutions SAS,France,
+4962-0001193125-10-041232,Sociedad Internacional de Servicios de Panama S.A.,Panama,
+4962-0001193125-10-041232,TransUnion Limited,Hong Kong,
+4962-0001193125-10-041232,American Express Service (Thailand) Company Limited,Thailand,
+4962-0001193125-10-041232,"TRS Card International, Inc.",(USA) Delaware,
+4962-0001193125-10-041232,"American Express de Espana, S.A. (Sociedad Unipersonal)",Spain,
+4962-0001193125-10-041232,"American Express E.F.C., S.A. (Sociedad Unipersonal)",Spain,
+4962-0001193125-10-041232,"American Express Foreign Exchange, S.A. (Sociedad Unipersonal)",Spain,
+4962-0001193125-10-041232,"American Express Viajes, S.A. (Sociedad Unipersonal)",Spain,
+4962-0001193125-10-041232,American Express Barcelo Viajes SL,Spain,
+4962-0001193125-10-041232,"Amex Asesores de Seguros, S.A. (Sociedad Unipersonal)",Spain,
+4962-0001193125-10-041232,American Express Marketing & Development Corp.,(USA) Delaware,
+4962-0001193125-10-041232,American Express Prepaid Card Management Corporation,(USA) Arizona,
+4962-0001193125-10-041232,American Express Publishing Corporation,(USA) New York,
+4962-0001193125-10-041232,American Express Receivables Financing Corporation II,(USA) Delaware,
+4962-0001193125-10-041232,American Express Receivables Financing Corporation V LLC,(USA) Delaware,
+4962-0001193125-10-041232,Amex (Middle East) B.S.C. (c),Bahrain,
+4962-0001193125-10-041232,Amex Al Omania LLC,Oman,
+4962-0001193125-10-041232,Amex Egypt LLC,Egypt,
+4962-0001193125-10-041232,ASAL (American Express Saudi Arabia Ltd),Bahrain,
+4962-0001193125-10-041232,Amex Bank of Canada,Canada,
+4962-0001193125-10-041232,Amex Canada Inc.,Canada,
+4962-0001193125-10-041232,Amex Card Services Company,(USA) Delaware,
+4962-0001193125-10-041232,Asesorías e Inversiones American Express Chile Limitada,Chile,
+4962-0001193125-10-041232,Amex Inmobiliaria Limitada,Chile,
+4962-0001193125-10-041232,"Bansamex, S.A.",Spain,
+4962-0001193125-10-041232,Cardmember Financial Services Limited,Jersey,
+4962-0001193125-10-041232,"Cavendish Holdings, Inc.",(USA) Delaware,
+4962-0001193125-10-041232,"Drillamex, Inc.",(USA) Delaware,
+4962-0001193125-10-041232,FRC West Property. LLC,(USA) Arizona,
+4962-0001193125-10-041232,"Harbor Payments, Inc.",(USA) Delaware,
+4962-0001193125-10-041232,"Fiware Holdings, Inc.",(USA) Delaware,
+4962-0001193125-10-041232,Harbor Payments Corporation,(USA) Georgia,
+4962-0001193125-10-041232,Southern Africa Travellers Cheque Company (Pty) Ltd,South Africa,
+4962-0001193125-10-041232,Swiss Bankers Prepaid Services AG,Switzerland,
+4962-0001193125-10-041232,"Travel Impressions, Ltd.",(USA) Delaware,
+4962-0001193125-10-041232,Travellers Cheque Associates Limited,United Kingdom,
+4962-0001193125-10-041232,AMEX Assurance Company,(USA) Illinois,
+4962-0001193125-10-041232,Amexco Insurance Company,(USA) Vermont,
+4962-0001193125-10-041232,"National Express Company, Inc.",(USA) New York,
+4962-0001193125-10-041232,"The Balcor Company Holdings, Inc.",(USA) Delaware,
+4962-0001193125-10-041232,The Balcor Company,(USA) Delaware,
+4962-0001193125-10-041232,"Rexport, Inc.",(USA) Delaware,
+5981-0001193125-12-106666,AMVAC Chemical Corporation,California,
+5981-0001193125-12-106666,"GemChem, Inc.",California,
+5981-0001193125-12-106666,2110 Davie Corporation (formerly ABSCO Distributing),California,
+5981-0001193125-12-106666,AMVAC Chemical UK Ltd.*,"Surrey, England",
+5981-0001193125-12-106666,AMVAC Chemical GmbH,Switzerland,
+5981-0001193125-12-106666,AMVAC do Brasil Representácoes Ltda,Brasil,
+5981-0001193125-12-106666,"Agroservicios Amvac, SA de CV",Mexico,
+5981-0001193125-12-106666,Quimica Amvac de Mexico SA de CV,Mexico,
+5981-0001193125-12-106666,AMVAC de Costa Rica Sociedad Anonima,Costa Rica,
+5981-0001193125-12-106666,"Environmental Mediation, Inc.",California,
+5981-0001193125-12-106666,Calhart Corporation,California,
+5981-0001193125-12-106666,"Manufacturers Mirror & Glass Co., Inc.",California,
+5981-0001193125-12-106666,Todagco (80%)*,California,
+5981-0001193125-12-106666,American Vanguard Corporation of Imperial Valley (90%)*,California,
+5981-0001193125-12-106666,AMVAC Ag-Chem*,California,
+5981-0001193125-12-106666,AMVAC Chemical Corporation-Nevada*,Nevada,
+11199-0001104659-06-016718,"Bemis Company, Inc. (the “Registrant”)",Missouri,
+11199-0001104659-06-016718,"Banner Packaging, Inc.",Wisconsin,100.0
+11199-0001104659-06-016718,"Bemis Clysar, Inc.",Minnesota,100.0
+11199-0001104659-06-016718,"Bemis Czech Republic, s.r.o.",Czech Republic,100.0
+11199-0001104659-06-016718,Bemis Deutschland Holdings GmbH,Germany,100.0
+11199-0001104659-06-016718,Bemis Packaging Deutschland GmbH,Germany,100.0
+11199-0001104659-06-016718,"Bemis Europe Holdings, S.A.",Belgium,100.0
+11199-0001104659-06-016718,Bemis Monceau S.A.,Belgium,100.0
+11199-0001104659-06-016718,Techy France S.A.R.L.,France,100.0
+11199-0001104659-06-016718,"Bemis Flexible Packaging de Mexico, S.A. de C.V.",Mexico,100.0
+11199-0001104659-06-016718,"Bemis Flexible Packaging Mexico Servicios, S.A. de C.V.",Mexico,100.0
+11199-0001104659-06-016718,Bemis France Holdings S.A.S.,France,100.0
+11199-0001104659-06-016718,Bemis Packaging France S.A.S.,France,100.0
+11199-0001104659-06-016718,Bemis Le Trait S.A.S.,France,100.0
+11199-0001104659-06-016718,Bemis Epernon S.A.S.,France,100.0
+11199-0001104659-06-016718,Bemis Hungary Trading Limited Liability Company,Hungary,100.0
+11199-0001104659-06-016718,Bemis Packaging Danmark ApS,Denmark,100.0
+11199-0001104659-06-016718,Bemis Packaging Italia S.r.l,Italy,100.0
+11199-0001104659-06-016718,Bemis Packaging Sverige A.B.,Sweden,100.0
+11199-0001104659-06-016718,Bemis Packaging U.K. Ltd.,United Kingdom,100.0
+11199-0001104659-06-016718,Bemis Valkeakoski Oy,Finland,100.0
+11199-0001104659-06-016718,Bolsas Bemis S.A. de C.V.,Mexico,51.0
+11199-0001104659-06-016718,Bolsas Bemis Servicios Mexico S.A. de C.V.,Mexico,51.0
+11199-0001104659-06-016718,"Curwood, Inc.",Delaware,100.0
+11199-0001104659-06-016718,Curwood Packaging (Canada) Limited,Canada,100.0
+11199-0001104659-06-016718,Bemis Packaging Ireland Limited,Ireland,100.0
+11199-0001104659-06-016718,Bemis Swansea Limited,United Kingdom,100.0
+11199-0001104659-06-016718,Bemis Packaging Espana sl,Spain,100.0
+11199-0001104659-06-016718,Itap Bemis Ltda.,Brazil,22.0
+11199-0001104659-06-016718,"Perfecseal, Inc.",Delaware,100.0
+11199-0001104659-06-016718,"Perfecseal Internacional de Puerto Rico, Inc.",Delaware,100.0
+11199-0001104659-06-016718,Perfecseal International Ltd.,Delaware,100.0
+11199-0001104659-06-016718,Perfecseal Limited,United Kingdom,100.0
+11199-0001104659-06-016718,Bemis Asia Pacific Sdn Bhd,Malaysia,100.0
+11199-0001104659-06-016718,"DEMF DT Holdings I, LLC",Delaware,100.0
+11199-0001104659-06-016718,Itap Bemis Ltda.,Brazil,23.0
+11199-0001104659-06-016718,Hayco Liquidation Company,Delaware,100.0
+11199-0001104659-06-016718,Bemis U.K. Limited,United Kingdom,50.0
+11199-0001104659-06-016718,"MacKay, Inc.",Kentucky,100.0
+11199-0001104659-06-016718,"Milprint, Inc.",Wisconsin,100.0
+11199-0001104659-06-016718,"Curwood Specialty Films – Lebanon, Inc.",Delaware,100.0
+11199-0001104659-06-016718,Misbe Participacoes Ltda.,Brazil,100.0
+11199-0001104659-06-016718,SH Participacoes S.A.,Brazil,100.0
+11199-0001104659-06-016718,DT Participacoes S.A.,Brazil,76.0
+11199-0001104659-06-016718,Dixie Toga S.A.,Brazil,92.0
+11199-0001104659-06-016718,DT Participacoes S.A.,Brazil,24.0
+11199-0001104659-06-016718,Dixie Toga S.A.,Brazil,8.0
+11199-0001104659-06-016718,American Packaging S.A.,Argentina,98.0
+11199-0001104659-06-016718,American Plast S.A.,Argentina,60.0
+11199-0001104659-06-016718,Dixie Toga International Ltd.,Cayman Islands,100.0
+11199-0001104659-06-016718,Dixie Toga Centro-Oeste Embalagens S.A.,Brazil,100.0
+11199-0001104659-06-016718,Dixie Toga Nordeste S.A.,Brazil,100.0
+11199-0001104659-06-016718,Impressora Paranaense S.A.,Brazil,100.0
+11199-0001104659-06-016718,Insit Embalagens Ltda.,Brazil,90.0
+11199-0001104659-06-016718,Itap Bemis Ltda.,Brazil,55.0
+11199-0001104659-06-016718,Itap Bemis Centro Oeste-Industria e Comércio de Embalagens Ltda.,Brazil,100.0
+11199-0001104659-06-016718,Curwood Chile Ltda.,Chile,100.0
+11199-0001104659-06-016718,Laminor S.A.,Brazil,50.0
+11199-0001104659-06-016718,M&W Toga Industria e Comércio S.A.,Brazil,60.0
+11199-0001104659-06-016718,Morgan Adhesives Company,Ohio,100.0
+11199-0001104659-06-016718,Bemis Coordination Center S.A.,Belgium,33.0
+11199-0001104659-06-016718,Bemis U.K. Limited,United Kingdom,50.0
+11199-0001104659-06-016718,MACtac U.K. Limited,United Kingdom,100.0
+11199-0001104659-06-016718,"Electronic Printing Products, Inc.",Ohio,100.0
+11199-0001104659-06-016718,Enterprise Software Inc.,Ohio,100.0
+11199-0001104659-06-016718,"MACtac Engineered Products, Inc.",Ohio,100.0
+29644-0001628280-16-019746,"Aerospace Filtration Systems, Inc.","Chesterfield, MO USA",
+29644-0001628280-16-019746,ASHC LLC,"Minneapolis, MN USA",
+29644-0001628280-16-019746,DLX Capital S.a.r.l.,"Luxembourg City, Luxembourg",
+29644-0001628280-16-019746,DLX USD FIN CO. S.a.r.l.,"Luxembourg City, Luxembourg",
+29644-0001628280-16-019746,"Donaldson (China) Holding Co., Ltd","Shanghai, China",
+29644-0001628280-16-019746,"Donaldson (China) Trading Co., Ltd","Wuxi, China",
+29644-0001628280-16-019746,Donaldson (Thailand) Ltd.,"Rayong, Thailand",
+29644-0001628280-16-019746,"Donaldson (Wuxi) Filters Co., Ltd.","Wuxi, China",
+29644-0001628280-16-019746,Donaldson (Xuzhou) Filters Co. Ltd.,"Xuzhou, China",
+29644-0001628280-16-019746,Donaldson Australasia Pty. Ltd,"Wyong, Australia",
+29644-0001628280-16-019746,"Donaldson Belgie, b.v.b.a.","Leuven, Belgium",
+29644-0001628280-16-019746,"Donaldson Canada, Inc","Brockville, Ontario, Canada",
+29644-0001628280-16-019746,"Donaldson Capital, Inc.","Minneapolis, MN USA",
+29644-0001628280-16-019746,"Donaldson Chile, Ltd.","Santiago, Chile",
+29644-0001628280-16-019746,Donaldson Columbia S.A.S.,"Bogotá, Columbia",
+29644-0001628280-16-019746,Donaldson Czech Republic s.r.o,"Klasterec nad Ohri, Czech Republic",
+29644-0001628280-16-019746,Donaldson do Brasil Equipamentos Industriais Ltda,"Atibaia, São Paulo, Brazil",
+29644-0001628280-16-019746,"Donaldson Europe, b.v.b.a.","Leuven, Belgium",
+29644-0001628280-16-019746,Donaldson Far East Ltd.,"Hong Kong, S.A.R., China",
+29644-0001628280-16-019746,Donaldson Filter Components Ltd.,"Hull, United Kingdom",
+29644-0001628280-16-019746,Donaldson Filtration (Asia Pacific) Pte. Ltd.,"Changi, Singapore",
+29644-0001628280-16-019746,Donaldson Filtration (GB) Ltd.,"Leicester, United Kingdom",
+29644-0001628280-16-019746,Donaldson Filtration (Malaysia) Sdn. Bhd.,"Selangor Darul Ehsan, Malaysia",
+29644-0001628280-16-019746,Donaldson Filtration (Thailand) Ltd.,"Nonthaburi, Thailand",
+29644-0001628280-16-019746,Donaldson Filtration Deutschland GmbH,"Haan, Germany",
+29644-0001628280-16-019746,Donaldson Filtration Magyarorszag Kft,"Budapest, Hungary",
+29644-0001628280-16-019746,Donaldson Filtration Norway a.s.,"Moss, Norway",
+29644-0001628280-16-019746,"Donaldson Filtration Österreich, GmbH","Vienna, Austria",
+29644-0001628280-16-019746,Donaldson Filtration Slovensko s.r.o.,"Bratislava, Slovakia",
+29644-0001628280-16-019746,Donaldson Filtration Systems (Pty) Ltd.,"Cape Town, South Africa",
+29644-0001628280-16-019746,Donaldson Filtre Sistemleri,"Istanbul, Turkey",
+29644-0001628280-16-019746,"Donaldson France, s.a.s.","Paris, France",
+29644-0001628280-16-019746,Donaldson Ibèrica Soluciones,"Barcelona, Spain",
+29644-0001628280-16-019746,Donaldson India Filter Systems Pvt. Ltd.,"New Delhi, India",
+29644-0001628280-16-019746,Donaldson Industrial CR - Konzern s.r.o.,"Kadan, Czech Republic",
+29644-0001628280-16-019746,Donaldson Italia s.r.l.,"Ostiglia, Italy",
+29644-0001628280-16-019746,"Donaldson Korea Co., Ltd.","Seoul, South Korea",
+29644-0001628280-16-019746,Donaldson Luxembourg S.a.r.l,"Luxembourg City, Luxembourg",
+29644-0001628280-16-019746,Donaldson Nederland B.V.,"Almere, Netherlands",
+29644-0001628280-16-019746,Donaldson Overseas Holding S.a.r.l.,"Luxembourg City, Luxembourg",
+29644-0001628280-16-019746,Donaldson Peru SAC,"Lima, Peru",
+29644-0001628280-16-019746,Donaldson Polska Sp. z.o.o.,"Warsaw, Poland",
+29644-0001628280-16-019746,Donaldson Scandinavia a.p.s.,"Hørsholm, Denmark",
+29644-0001628280-16-019746,Donaldson Schweiz GmbH,"Zurich, Switzerland",
+29644-0001628280-16-019746,Donaldson Taiwan Ltd.,"Taipei, Taiwan",
+29644-0001628280-16-019746,Donaldson UK Holding Ltd.,"Hull, United Kingdom",
+29644-0001628280-16-019746,"Donaldson, S.A. de C.V.","Aguascalientes, Mexico",
+29644-0001628280-16-019746,"Donaldson, s.a.s.","Domjean, France",
+29644-0001628280-16-019746,"Le Bozec Filtration et Systèmes, s.a.s.","Paris, France",
+29644-0001628280-16-019746,Filtros Partmo S.A.S.,"Bogotá, Columbia",
+29644-0001628280-16-019746,Nippon Donaldson Ltd.,"Tachikawa, Tokyo, Japan",
+29644-0001628280-16-019746,"Northern Technical, L.L.C.","Abu Dhabi, United Arab Emirates",
+29644-0001628280-16-019746,P.T. Donaldson Filtration Indonesia,"Jakarta, Indonesia",
+29644-0001628280-16-019746,"Prestadora de Servicios Aguascalientes, S. de R.L. de C.V.","Aguascalientes, Mexico",
+29644-0001628280-16-019746,Ultrafilter s.a.s.,"Vigny, France",
+29644-0001628280-16-019746,Advanced Filtration Systems Inc.,"Champaign, IL USA",
+29644-0001628280-16-019746,AFSI Europe s.r.o.,"Most, Czech Republic",
+29644-0001628280-16-019746,"IFIL.USA, L.L.C.","Harrisonville, MO USA",
+29644-0001628280-16-019746,P.T. Panata Jaya Mandiri,"Jakarta, Indonesia",
+29644-0001628280-16-019746,Rashed Al-Rashed & Sons - Donaldson Company Ltd.,"Dammam, Saudi Arabia",
+38723-0000038723-09-000029,"Franklin Securities, Inc.",Georgia,
+38723-0000038723-09-000029,Frandisco Property and Casualty Insurance Company,Georgia,
+38723-0000038723-09-000029,Frandisco Life Insurance Company,Georgia,
+38723-0000038723-09-000029,"T&T, Inc.",Georgia,
+320340-0000950123-10-027168,ChemFree Corporation,Georgia,
+320340-0000950123-10-027168,"CoreCard Software, Inc.",Delaware,
+320340-0000950123-10-027168,CoreCard SRL,Romania,
+320340-0000950123-10-027168,ISC Software Pvt. Ltd.,India,
+716646-0000950135-06-004150,Clinical Data BV,The Netherlands,
+716646-0000950135-06-004150,Clinical Data Incorporated,Massachusetts,
+716646-0000950135-06-004150,"Clinical Data Sales & Service, Inc.",Delaware,
+716646-0000950135-06-004150,Electa Lab s.r.l.,Italy,
+716646-0000950135-06-004150,"Genaissance Pharmaceuticals, Inc.",Delaware,
+716646-0000950135-06-004150,Genome Express S.A.,France,
+716646-0000950135-06-004150,"GPSI Acquisition, Inc.",Delaware,
+716646-0000950135-06-004150,"Icoria, Inc.",Delaware,
+716646-0000950135-06-004150,"Lark Technologies, Inc.",Delaware,
+716646-0000950135-06-004150,NovaChem BV,The Netherlands,
+716646-0000950135-06-004150,Spectronetics NV,"Curaçao, Netherlands Antilles",
+716646-0000950135-06-004150,Vital Scientific NV,The Netherlands,
+716646-0000950135-06-004150,Vital Diagnostics Pty. Ltd.,Australia,
+716646-0000950135-06-004150,Vital Diagnostics Ltd,New Zealand,
+719402-0001193125-14-113892,"First Bank, Inc.",Virginia,
+719402-0001193125-14-113892,"First Bank Financial Services, Inc.",Virginia,
+719402-0001193125-14-113892,"Shen-Valley Land Holdings, LLC",Virginia,
+719402-0001193125-14-113892,First National (VA) Statutory Trust II,Delaware,
+719402-0001193125-14-113892,First National (VA) Statutory Trust III,Delaware,
+749660-0001193125-12-104800,"Xoft, Inc.",Delaware,
+56679-0001193125-16-634657,Korn Ferry International S.A.,Argentina,
+56679-0001193125-16-634657,Korn Ferry Futurestep Argentina S.R.L.,Argentina,
+56679-0001193125-16-634657,Korn/Ferry International Pty Limited,Australia,
+56679-0001193125-16-634657,Futurestep (Australia) Pty Ltd,Australia,
+56679-0001193125-16-634657,Korn/Ferry International GmbH,Austria,
+56679-0001193125-16-634657,Korn/Ferry International Futurestep (Belgium) BVBA,Belgium,
+56679-0001193125-16-634657,"Personnel Decisions International Belgium, BVBA",Belgium,
+56679-0001193125-16-634657,Korn/Ferry International Consultoria Ltda.,Brazil,
+56679-0001193125-16-634657,"Korn/Ferry Canada, Inc.",Canada,
+56679-0001193125-16-634657,Korn/Ferry International Futurestep (Canada) Inc.,Canada,
+56679-0001193125-16-634657,Korn/Ferry International S.A.,Chile,
+56679-0001193125-16-634657,Korn/Ferry International Human Capital Consulting (Beijing) Limited,"Beijing, China",
+56679-0001193125-16-634657,Guangzhou Korn/Ferry Human Capital Company Ltd.,"Guangzhou, China",
+56679-0001193125-16-634657,"Korn/Ferry (Shanghai) Human Capital Consulting Co., Ltd.","Shanghai, China",
+56679-0001193125-16-634657,PuDe Management Consulting Co. Ltd.,"Shanghai, China",
+56679-0001193125-16-634657,Futurestep (Shanghai) Talent Consulting Company Limited,China,
+56679-0001193125-16-634657,Korn/Ferry International — Colombia,Colombia,
+56679-0001193125-16-634657,Korn/Ferry International A/S,Denmark,
+56679-0001193125-16-634657,Korn/Ferry International SAS,France,
+56679-0001193125-16-634657,Korn/Ferry International Futurestep (France) SARL,France,
+56679-0001193125-16-634657,Personnel Decisions International France SAS,France,
+56679-0001193125-16-634657,Korn/Ferry International GmbH,Germany,
+56679-0001193125-16-634657,Futurestep Germany GmbH,Germany,
+56679-0001193125-16-634657,Korn/Ferry International SA,Greece,
+56679-0001193125-16-634657,Korn/Ferry International (H.K.) Limited,Hong Kong,
+56679-0001193125-16-634657,Futurestep (Hong Kong) Ltd.,Hong Kong,
+56679-0001193125-16-634657,Korn/Ferry International Budapest Personnel Consulting and Service Ltd.,Hungary,
+56679-0001193125-16-634657,"PDI Hungary, Kft.",Hungary,
+56679-0001193125-16-634657,Korn/Ferry International Private Limited,India,
+56679-0001193125-16-634657,Futurestep Recruitment Services Private Limited.,India,
+56679-0001193125-16-634657,Personnel Decisions International India Pvt. Limited,India,
+56679-0001193125-16-634657,PT. Korn/Ferry International,Indonesia,
+56679-0001193125-16-634657,Korn/Ferry International S.R.L.,Italy,
+56679-0001193125-16-634657,Futurestep (Italia) S.r.l.,Italy,
+56679-0001193125-16-634657,Nihon Korn/Ferry International K.K.,Japan,
+56679-0001193125-16-634657,Futurestep (Japan) K.K.,Japan,
+56679-0001193125-16-634657,Korn Ferry Consulting — Japan,Japan,
+56679-0001193125-16-634657,Korn/Ferry International (Korea) Limited,Korea,
+56679-0001193125-16-634657,Agensi Pekerjaan Futurestep Worldwide (M) Sdn. Bhd.,Malaysia,
+56679-0001193125-16-634657,Korn/Ferry International (M) Sdn. Bhd.,Malaysia,
+56679-0001193125-16-634657,Korn/Ferry Investment India Limited (Mauritius OCB),Mauritius,
+56679-0001193125-16-634657,Korn/Ferry Mexico S.C.,Mexico,
+56679-0001193125-16-634657,Korn Ferry International B.V.,Netherlands,
+56679-0001193125-16-634657,Korn/Ferry International Futurestep (Holdings) B.V.,Netherlands,
+56679-0001193125-16-634657,Korn Ferry International NZ Limited,New Zealand,
+56679-0001193125-16-634657,Futurestep (New Zealand) Ltd.,New Zealand,
+56679-0001193125-16-634657,Korn/Ferry International A/S,Norway,
+56679-0001193125-16-634657,Korn/Ferry International — Peru S.A.,Peru,
+56679-0001193125-16-634657,Korn/Ferry International Sp.z.o.o.,Poland,
+56679-0001193125-16-634657,Korn/Ferry International Futurestep (POLSKA) Sp.z.o.o.,Poland,
+56679-0001193125-16-634657,Korn/Ferry International Pte. Ltd.,Singapore,
+56679-0001193125-16-634657,Futurestep (Singapore) Pte Limited,Singapore,
+56679-0001193125-16-634657,"PDI Slovensko, sro",Slovakia,
+56679-0001193125-16-634657,Korn/Ferry International S.A.,Spain,
+56679-0001193125-16-634657,"Futurestep (Espana), S.L.",Spain,
+56679-0001193125-16-634657,Korn/Ferry International AB,Sweden,
+56679-0001193125-16-634657,Personnel Decisions International Scandinavia A.B.,Sweden,
+56679-0001193125-16-634657,Korn-Ferry (Schweiz) AG,Switzerland,
+56679-0001193125-16-634657,Korn/Ferry International (Taiwan) Co. Limited,Taiwan,
+56679-0001193125-16-634657,Korn/Ferry International Musavirlik Limited Sirketi,Turkey,
+56679-0001193125-16-634657,Futurestep (UK) Limited,United Kingdom,
+56679-0001193125-16-634657,Korn/Ferry International Limited,United Kingdom,
+56679-0001193125-16-634657,KFI (UK) Limited,United Kingdom,
+56679-0001193125-16-634657,The Whitehead Mann Partnership LLP,United Kingdom,
+56679-0001193125-16-634657,Whitehead Mann Limited,United Kingdom,
+56679-0001193125-16-634657,"Personnel Decisions International, Europe Limited",United Kingdom,
+56679-0001193125-16-634657,Personnel Decisions International UK Ltd,United Kingdom,
+56679-0001193125-16-634657,Korn Ferry Global Holdings (UK) Limited,United Kingdom,
+56679-0001193125-16-634657,Korn Ferry GH1 Limited,United Kingdom,
+56679-0001193125-16-634657,"Pivot Learning, Limited",United Kingdom,
+56679-0001193125-16-634657,Continental American Management Corp.,"United States, California",
+56679-0001193125-16-634657,Korn/Ferry International Holding India,"United States, California",
+56679-0001193125-16-634657,"Korn/Ferry International Futurestep, Inc.","United States, Delaware",
+56679-0001193125-16-634657,Korn/Ferry International Futurestep (Holdings) Inc.,"United States, Delaware",
+56679-0001193125-16-634657,"Korn/Ferry International Worldwide, Inc.","United States, Delaware",
+56679-0001193125-16-634657,"K/FI Canada Holdings, LLC","United States, Delaware",
+56679-0001193125-16-634657,Korn Ferry Leadership Consulting Corporation,"United States, Delaware",
+56679-0001193125-16-634657,"Ninth House, Inc.","United States, Delaware",
+56679-0001193125-16-634657,"Korn Ferry Global Holdings, Inc.","United States, Delaware",
+56679-0001193125-16-634657,Personnel Decisions International Greater China Corporation,"United States, Minnesota",
+56679-0001193125-16-634657,Personnel Decisions International Singapore Corporation,"United States, Minnesota",
+56679-0001193125-16-634657,"Sensa Solutions, Inc.","United States, Virginia",
+56679-0001193125-16-634657,"Korn/Ferry International Consultores Asociados, C.A.",Venezuela,
+56679-0001193125-16-634657,"Hay Group Holdings, Inc.","United States, Delaware",
+56679-0001193125-16-634657,"Hay Group International, Inc.","United States, Delaware",
+56679-0001193125-16-634657,"Hay Group, Inc.","United States, Delaware",
+56679-0001193125-16-634657,"Hay Group Management, Inc.","United States, Delaware",
+56679-0001193125-16-634657,Hay Group Limited,Canada,
+56679-0001193125-16-634657,Hay Group N.V./S.A.,Belgium,
+56679-0001193125-16-634657,Hay Group Czech s.r.o.,Czech Republic,
+56679-0001193125-16-634657,Hay Group Oy,Finland,
+56679-0001193125-16-634657,Hay Group S.A.,France,
+56679-0001193125-16-634657,Hay France S.A.,France,
+56679-0001193125-16-634657,Hay Group GmbH,Germany,
+56679-0001193125-16-634657,Hay Group S.A.,Greece,
+56679-0001193125-16-634657,Hay Group Management Consultants Ltd.,Hungary,
+56679-0001193125-16-634657,Hay Group (Ireland) Limited,Ireland,
+56679-0001193125-16-634657,Hay Management Consultants Ireland Ltd.,Ireland,
+56679-0001193125-16-634657,Hay Group S.r.l.,Italy,
+56679-0001193125-16-634657,Hay Group UAB,Lithuania,
+56679-0001193125-16-634657,HG (Luxembourg) S.a.r.l.,Luxembourg,
+56679-0001193125-16-634657,Talent Q International Ltd.,Malta,
+56679-0001193125-16-634657,Talent Q Distribution Ltd.,Malta,
+56679-0001193125-16-634657,Hay Group B.V.,Netherlands,
+56679-0001193125-16-634657,Hay Group Investment Holding B.V.,Netherlands,
+56679-0001193125-16-634657,Hay Management International B.V.,Netherlands,
+56679-0001193125-16-634657,Hay Group Partners Holding B.V.,Netherlands,
+56679-0001193125-16-634657,Hay Group AS,Norway,
+56679-0001193125-16-634657,Hay Group Sp.Z o.o,Poland,
+56679-0001193125-16-634657,Hay Group S.A.,Portugal,
+56679-0001193125-16-634657,Hay Group LLC,Qatar,
+56679-0001193125-16-634657,Hay Group Management Consultants SRL,Romania,
+56679-0001193125-16-634657,OOO Hay Group (Hay Group Ltd.),Russia,
+56679-0001193125-16-634657,Hay Group Saudi Arabia Ltd.,Saudi Arabia,
+56679-0001193125-16-634657,Hay Group s.r.o.,Slovakia,
+56679-0001193125-16-634657,Hay Group South Africa (Pty) Ltd.,South Africa,
+56679-0001193125-16-634657,Hay Group S.A.,Spain,
+56679-0001193125-16-634657,Hay Group AB,Sweden,
+56679-0001193125-16-634657,Hay Group Danismanlik Limited Sirketi,Turkey,
+56679-0001193125-16-634657,Hay Group LLC,Ukraine,
+56679-0001193125-16-634657,The Hay Group Management Limited,United Kingdom,
+56679-0001193125-16-634657,Hay Group UK Holdings Limited,United Kingdom,
+56679-0001193125-16-634657,Hay Group Intermediary Limited,United Kingdom,
+56679-0001193125-16-634657,Talent Q Services Limited,United Kingdom,
+56679-0001193125-16-634657,Talent Q Limited,United Kingdom,
+56679-0001193125-16-634657,Hay Group Pty. Limited,Australia,
+56679-0001193125-16-634657,"Hay Group Co., (Shanghai) Ltd.",China,
+56679-0001193125-16-634657,Hay Group Limited,Hong Kong,
+56679-0001193125-16-634657,Hay Group Asia Limited,Hong Kong,
+56679-0001193125-16-634657,Hay Consultants India Private Ltd.,India,
+56679-0001193125-16-634657,Talent Q India Private Ltd.,India,
+56679-0001193125-16-634657,PT Hay Group,Indonesia,
+56679-0001193125-16-634657,"Hay Group (Japan), Ltd.",Japan,
+56679-0001193125-16-634657,Hay Group Sdn. Bhd.,Malaysia,
+56679-0001193125-16-634657,Hay Group Limited,New Zealand,
+56679-0001193125-16-634657,Hay Group Pte Ltd.,Singapore,
+56679-0001193125-16-634657,Hay Group Ltd.,South Korea,
+56679-0001193125-16-634657,Hay Group Limited,Thailand,
+56679-0001193125-16-634657,Hay Group Consulting Limited Liability,Vietnam,
+56679-0001193125-16-634657,Hay Argentina S.A.,Argentina,
+56679-0001193125-16-634657,Hay do Brasil Consultores Ltda.,Brazil,
+56679-0001193125-16-634657,Hay GroupLimitada,Chile,
+56679-0001193125-16-634657,Hay Group Ltda,Colombia,
+56679-0001193125-16-634657,"Hay Group, S.R.L.",Costa Rica,
+56679-0001193125-16-634657,Hay Financial Corporation N.V.,Curacao,
+56679-0001193125-16-634657,Hay Group S.A. de C.V.,Mexico,
+56679-0001193125-16-634657,Hay Group S.A.,Peru,
+56679-0001193125-16-634657,"Hay Group Venezuela, S.A.",Venezuela,
+56679-0001193125-16-634657,Hay Management Consultants Limited,Bermuda,
+56679-0001193125-16-634657,HG (Bermuda) Holding Limited,Bermuda,
+56679-0001193125-16-634657,Korn Ferry GP Ventures LLC,"United States, Delaware",
+56679-0001193125-16-634657,Korn Ferry Global Ventures LP,United Kingdom,
+56679-0001193125-16-634657,Korn/Ferry International Futurestep (the Netherlands) BV,Netherlands,
+75829-0001206774-11-002167,Medsep Corporation,Delaware,
+75829-0001206774-11-002167,Pall Acquisition LLC,Delaware,
+75829-0001206774-11-002167,Pall Aeropower Corporation,Delaware,
+75829-0001206774-11-002167,"Pall Biomedical, Inc.",Delaware,
+75829-0001206774-11-002167,Pall Industrial Membranes LLC,Delaware,
+75829-0001206774-11-002167,"Pall Life Sciences Puerto Rico, LLC",Puerto Rico,
+75829-0001206774-11-002167,"Pall – PASS US, LLC",Delaware,
+75829-0001206774-11-002167,Russell Associates Inc.,Maryland,
+75829-0001206774-11-002167,"Gelman Sciences, Inc.",Michigan,
+75829-0001206774-11-002167,Pall Austria Filter GesmbH,Austria,
+75829-0001206774-11-002167,Pall (Canada) Limited,Canada,
+75829-0001206774-11-002167,Pall Do Brasil,Brazil,
+75829-0001206774-11-002167,Pall Europe Limited (a),England,
+75829-0001206774-11-002167,Pall France S.A.S.,France,
+75829-0001206774-11-002167,Pall Deutschland Beteiligungs GmbH,Germany,
+75829-0001206774-11-002167,Pall Deutschland Holding GmbH & Co. KG Partnership (c),Germany,
+75829-0001206774-11-002167,Pall Italia S.R.L.,Italy,
+75829-0001206774-11-002167,Pall Manufacturing UK Limited,England,
+75829-0001206774-11-002167,Gelman Ireland Ltd.,Ireland,
+75829-0001206774-11-002167,Pall Netherlands B.V. (a),The Netherlands,
+75829-0001206774-11-002167,PLLN C.V. Partnership (b),The Netherlands,
+75829-0001206774-11-002167,Pall Norge AS,Norway,
+75829-0001206774-11-002167,Pall Espana S.A.U.,Spain,
+75829-0001206774-11-002167,Pall Norden AB,Sweden,
+75829-0001206774-11-002167,Pall (Schweiz) A.G.,Switzerland,
+75829-0001206774-11-002167,Argentaurum A.G.,Switzerland,
+75829-0001206774-11-002167,Pall Asia International Ltd.,Hong Kong,
+75829-0001206774-11-002167,Pall India Private Ltd.,India,
+75829-0001206774-11-002167,PT Pall Filtration Indonesia,Indonesia,
+75829-0001206774-11-002167,Nihon Pall Ltd.,Japan,
+75829-0001206774-11-002167,Pall New Zealand Limited,New Zealand,
+75829-0001206774-11-002167,Pall Filtration Pte. Ltd.,Singapore,
+75829-0001206774-11-002167,Pall Singapore Taiwan Branch Holding Company Pte. Ltd.,Singapore,
+75829-0001206774-11-002167,Pall Korea Ltd.,South Korea,
+75829-0001206774-11-002167,Pall Corporation Filtration and Separations (Thailand) Ltd.,Thailand,
+75829-0001206774-11-002167,Pall Australia Pty LTD,Australia,
+89800-0000089800-18-000004,"Acquire Sourcing, LLC",DE,
+89800-0000089800-18-000004,"Comex North America, Inc.",DE,
+89800-0000089800-18-000004,Contract Transportation Systems Co.,DE,
+89800-0000089800-18-000004,CTS National Corporation,DE,
+89800-0000089800-18-000004,Omega Specialty Products & Services LLC,OH,
+89800-0000089800-18-000004,"Plasti-Kote Co., Inc.",OH,
+89800-0000089800-18-000004,"Sherwin-Williams Realty Holdings, Inc.",IL,
+89800-0000089800-18-000004,SWIMC LLC,DE,
+89800-0000089800-18-000004,The Sherwin-Williams Acceptance Corporation,NV,
+89800-0000089800-18-000004,The Sherwin-Williams Headquarters Company,OH,
+89800-0000089800-18-000004,The Sherwin-Williams Manufacturing Company,OH,
+89800-0000089800-18-000004,The Sherwin-Williams US Licensing Company,DE,
+89800-0000089800-18-000004,"Valspar Specialty Paints, LLC",DE,
+89800-0000089800-18-000004,"Compania Sherwin-Williams, S.A. de C.V.",Mexico,
+89800-0000089800-18-000004,Deep Pride Limited,Ireland,
+89800-0000089800-18-000004,Dongguan Lilly Paint Industries Ltd,China,
+89800-0000089800-18-000004,EPS B.V.,Netherlands,
+89800-0000089800-18-000004,EPS Polidrox Industria e Comercio de Resinas Ltda,Brazil,
+89800-0000089800-18-000004,"EPS (Shanghai) Trading Co., Ltd.",China,
+89800-0000089800-18-000004,Geocel Limited,UK,
+89800-0000089800-18-000004,Guangdong Valspar Paints Manufacturing Co Ltd.,China,
+89800-0000089800-18-000004,Guangdong Yuegang Dadi Paints Company Limited,China,
+89800-0000089800-18-000004,Guardsman Australia Pty Limited,Australia,
+89800-0000089800-18-000004,Guardsman Industries Limited,UK,
+89800-0000089800-18-000004,Invercolor Bologna Srl,Italy,
+89800-0000089800-18-000004,Invercolor Ltd,UK,
+89800-0000089800-18-000004,Invercolor Roma Srl,Italy,
+89800-0000089800-18-000004,Invercolor Torino Srl,Italy,
+89800-0000089800-18-000004,Invercolor Toscana Srl,Italy,
+89800-0000089800-18-000004,Inver East Med S.A.,Greece,
+89800-0000089800-18-000004,Inver France SAS,France,
+89800-0000089800-18-000004,Inver GmbH,Germany,
+89800-0000089800-18-000004,Inver Industrial Coating SRL,Romania,
+89800-0000089800-18-000004,Inver Polska Spóika Z O.O,Poland,
+89800-0000089800-18-000004,Inver Spa,Italy,
+89800-0000089800-18-000004,Isocoat Tintas e Vernizes Ltda,Brazil,
+89800-0000089800-18-000004,Isva Vernici Srl,Italy,
+89800-0000089800-18-000004,"Jiangsu Pulanna Coating Co., Ltd.",China,
+89800-0000089800-18-000004,Oy Sherwin-Williams Finland Ab,Finland,
+89800-0000089800-18-000004,Pinturas Condor S.A.,Ecuador,
+89800-0000089800-18-000004,Pinturas Industriales S.A.,Uruguay,
+89800-0000089800-18-000004,Plasti-kote Limited,UK,
+89800-0000089800-18-000004,"Productos Quimicos y Pinturas, S.A. de C.V.",Mexico,
+89800-0000089800-18-000004,PT Sherwin-Williams Indonesia,Indonesia,
+89800-0000089800-18-000004,PT Valspar Indonesia,Indonesia,
+89800-0000089800-18-000004,Quest Automotive Products UK Limited,UK,
+89800-0000089800-18-000004,"Quetzal Pinturas, S.A. de C.V.",Mexico,
+89800-0000089800-18-000004,Resin Surfaces Limited,UK,
+89800-0000089800-18-000004,Ronseal (Ireland) Limited,Ireland,
+89800-0000089800-18-000004,Sherwin-Williams Argentina I.y C.S.A.,Argentina,
+89800-0000089800-18-000004,Sherwin-Williams Aruba VBA,Aruba,
+89800-0000089800-18-000004,Sherwin-Williams (Australia) Pty. Ltd.,AU,
+89800-0000089800-18-000004,Sherwin-Williams Automotive Mexico S.de R.L.de C.V.,Mexico,
+89800-0000089800-18-000004,Sherwin-Williams Balkan S.R.L.,Romania,
+89800-0000089800-18-000004,Sherwin-Williams Bel,Belarus,
+89800-0000089800-18-000004,Sherwin-Williams (Belize) Limited,Belize,
+89800-0000089800-18-000004,Sherwin-Williams Benelux NV,Belgium,
+89800-0000089800-18-000004,Sherwin-Williams Canada Inc.,Canada,
+89800-0000089800-18-000004,Sherwin-Williams (Caribbean) N.V.,Curacao,
+89800-0000089800-18-000004,Sherwin-Williams Cayman Islands Limited,Grand Cayman,
+89800-0000089800-18-000004,Sherwin-Williams Chile S.A.,Chile,
+89800-0000089800-18-000004,Sherwin-Williams Coatings India Private Limited,India,
+89800-0000089800-18-000004,Sherwin-Williams Coatings S.a r.l.,Luxembourg,
+89800-0000089800-18-000004,Sherwin Williams Colombia S.A.S.,Columbia,
+89800-0000089800-18-000004,Sherwin-Williams Czech Republic spol. s r.o,Czech Republic,
+89800-0000089800-18-000004,Sherwin-Williams Denmark A/S,Denmark,
+89800-0000089800-18-000004,Sherwin-Williams Deutschland GmbH,Germany,
+89800-0000089800-18-000004,Sherwin-Williams Diversified Brands (Australia) Pty Ltd,Australia,
+89800-0000089800-18-000004,Sherwin-Williams Diversified Brands Limited,UK,
+89800-0000089800-18-000004,Sherwin-Williams do Brasil Industria e Comercio Ltda.,Brazil,
+89800-0000089800-18-000004,Sherwin-Williams France Finishes SAS,France,
+89800-0000089800-18-000004,Sherwin-Williams (Ireland) Limited,Ireland,
+89800-0000089800-18-000004,Sherwin-Williams Italy S.r.l.,Italy,
+89800-0000089800-18-000004,Sherwin-Williams Luxembourg Investment Management Company S.a r.l.,Luxembourg,
+89800-0000089800-18-000004,Sherwin-Williams (Malaysia) Sdn. Bhd.,Malaysia,
+89800-0000089800-18-000004,Sherwin-Williams (Nantong) Company Limited,China,
+89800-0000089800-18-000004,Sherwin-Williams Norway AS,Norway,
+89800-0000089800-18-000004,Sherwin-Williams Paints Limited Liability Company,Russia,
+89800-0000089800-18-000004,Sherwin-Williams Peru S.R.L.,Peru,
+89800-0000089800-18-000004,Sherwin-Williams Pinturas de Venezuela S.A.,Venezuela,
+89800-0000089800-18-000004,Sherwin-Williams Poland Sp. z o.o,Poland,
+89800-0000089800-18-000004,Sherwin-Williams Protective & Marine Coatings,UK,
+89800-0000089800-18-000004,Sherwin-Williams (S) Pte. Ltd.,Singapore,
+89800-0000089800-18-000004,Sherwin-Williams Services (Malaysia) Sdn. Bhd.,Malaysia,
+89800-0000089800-18-000004,Sherwin-Williams (Shanghai) Limited,China,
+89800-0000089800-18-000004,"Sherwin-Williams (South China) Co., Ltd.",China,
+89800-0000089800-18-000004,Sherwin-Williams Spain Coatings S.L.,Spain,
+89800-0000089800-18-000004,Sherwin-Williams Sweden AB,Sweden,
+89800-0000089800-18-000004,"Sherwin-Williams (Thailand) Co., Ltd.",Thailand,
+89800-0000089800-18-000004,Sherwin-Williams Uruguay S.A.,Uruguay,
+89800-0000089800-18-000004,Sherwin-Williams (Vietnam) Limited,Vietnam,
+89800-0000089800-18-000004,Sherwin-Williams (West Indies) Limited,Jamaica,
+89800-0000089800-18-000004,Spanyc Paints Joint Stock Company,Vietnam,
+89800-0000089800-18-000004,SWIPCO – Sherwin Williams do Brasil Propriedade,Brazil,
+89800-0000089800-18-000004,Syntema I Vaggeryd AB,Sweden,
+89800-0000089800-18-000004,"Taiwan Valspar Co., Ltd.",Taiwan,
+89800-0000089800-18-000004,The Valspar (Asia) Corporation Limited,Hong Kong,
+89800-0000089800-18-000004,The Valspar (Australia) Corporation Pty. Ltd.,Australia,
+89800-0000089800-18-000004,The Valspar Corporation Limitada,Brazil,
+89800-0000089800-18-000004,The Valspar (Finland) Corporation Oy,Finland,
+89800-0000089800-18-000004,The Valspar (France) Corporation S.A.S.,France,
+89800-0000089800-18-000004,The Valspar (France) Research Corporation SAS,France,
+89800-0000089800-18-000004,The Valspar (Germany) GmbH,Germany,
+89800-0000089800-18-000004,The Valspar (Malaysia) Corporation Sdn Bhd,Malaysia,
+89800-0000089800-18-000004,The Valspar (Nantes) Corporation S.A.S.,France,
+89800-0000089800-18-000004,The Valspar (Singapore) Corporation Pte. Ltd,Singapore,
+89800-0000089800-18-000004,The Valspar (South Africa) Corporation (Pty) Ltd,South Africa,
+89800-0000089800-18-000004,The Valspar (Spain) Corporation S.R.L.,Spain,
+89800-0000089800-18-000004,The Valspar (Switzerland) Corporation AG,Switzerland,
+89800-0000089800-18-000004,The Valspar (Thailand) Corporation Ltd.,Thailand,
+89800-0000089800-18-000004,The Valspar (UK) Corporation Limited,UK,
+89800-0000089800-18-000004,The Valspar (Vietnam) Corporation Ltd.,Vietnam,
+89800-0000089800-18-000004,TOB Becker Acroma Ukraine,Ukraine,
+89800-0000089800-18-000004,UAB Sherwin-Williams Baltic,Lithuania,
+89800-0000089800-18-000004,"Valspar Aries Coatings, S. de R.L. de C.V.",Mexico,
+89800-0000089800-18-000004,Valspar Automotive Australia Pty Limited,Australia,
+89800-0000089800-18-000004,Valspar Automotive (UK) Corporation Limited,UK,
+89800-0000089800-18-000004,Valspar B.V.,Netherlands,
+89800-0000089800-18-000004,"Valspar Coatings (Guangdong) Co., Ltd.",China,
+89800-0000089800-18-000004,Valspar Coatings (Shanghai) Co. Ltd.,China,
+89800-0000089800-18-000004,"Valspar Coatings (Tianjin) Co., Ltd",China,
+89800-0000089800-18-000004,Valspar D.o.o Beograd,Serbia,
+89800-0000089800-18-000004,Valspar Inc.,Canada,
+89800-0000089800-18-000004,Valspar (India) Coatings Corporation Private Limited,India,
+89800-0000089800-18-000004,Valspar Industries GmbH,Germany,
+89800-0000089800-18-000004,Valspar Industries (Ireland) Ltd.,Ireland,
+89800-0000089800-18-000004,Valspar Industries (Italy) S.r.l.,Italy,
+89800-0000089800-18-000004,Valspar LLC,Russia,
+89800-0000089800-18-000004,"Valspar Mexicana, S.A. de C.V.",Mexico,
+89800-0000089800-18-000004,Valspar Paint (Australia) Pty Ltd,Australia,
+89800-0000089800-18-000004,Valspar Paint (NZ) Limited,New Zealand,
+89800-0000089800-18-000004,Valspar Powder Coatings Limited,UK,
+89800-0000089800-18-000004,Valspar Rock Company Limited,Japan,
+89800-0000089800-18-000004,"Valspar (Shanghai) Management Co., Ltd.",China,
+89800-0000089800-18-000004,Vantaco Oy,Finland,
+89800-0000089800-18-000004,Valspar (Uruguay) Corporation S.A.,Uruguay,
+89800-0000089800-18-000004,Valspar (WPC) Pty Ltd,Australia,
+89800-0000089800-18-000004,ZAO Sherwin-Williams,Russia,
+799233-0000799233-13-000013,"Heartland Express, Inc.",NV,
+799233-0000799233-13-000013,"A&M Express, Inc.",TN,
+799233-0000799233-13-000013,"Heartland Express, Inc. of Iowa",IA,
+799233-0000799233-13-000013,"Heartland Express Maintenance Services, Inc.",NV,
+799233-0000799233-13-000013,"Heartland Express Services, Inc.",NV,
+804328-0001234452-15-000271,"Qualcomm Technologies, Inc.",Delaware,
+804328-0001234452-15-000271,Qualcomm Global Trading Pte. Ltd.,Singapore,
+804328-0001234452-15-000271,Qualcomm CDMA Technologies Asia-Pacific Pte. Ltd.,Singapore,
+804328-0001234452-15-000271,Qualcomm Asia Pacific Pte. Ltd.,Singapore,
+804328-0001234452-15-000271,"Qualcomm Atheros, Inc.",Delaware,
+804328-0001234452-15-000271,"Qualcomm Technologies International, Ltd.",United Kingdom,
+821127-0000821127-11-000003,Borel Private Bank & Trust Company,California,
+821127-0000821127-11-000003,Boston Private Bank & Trust Company,Massachusetts,
+821127-0000821127-11-000003,Charter Private Bank,Washington,
+821127-0000821127-11-000003,First Private Bank & Trust,California,
+821127-0000821127-11-000003,"Anchor Capital Holdings, LLC",Delaware,
+821127-0000821127-11-000003,"Bingham, Osborn, & Scarborough, LLC",California,
+821127-0000821127-11-000003,"Dalton, Greiner, Hartman, Maher & Co. LLC",Delaware,
+821127-0000821127-11-000003,"KLS Professional Advisors Group, LLC",Delaware,
+821127-0000821127-11-000003,"Davidson Trust Company, LLC",Pennsylvania,
+869495-0001213900-18-002720,Deep Well Oil & Gas (Alberta) Ltd.,"Alberta, Canada",
+869495-0001213900-18-002720,Northern Alberta Oil Ltd.,"Alberta, Canada",
+860546-0001104659-07-015618,"COPT Aerotech, LLC",,
+860546-0001104659-07-015618,"COPT Interquest, LLC",,
+860546-0001104659-07-015618,"COPT Interquest III, LLC",,
+860546-0001104659-07-015618,"COPT Interquest IV, LLC",,
+860546-0001104659-07-015618,"COPT Newport, LLC",,
+860546-0001104659-07-015618,"COPT Newport C, LLC",,
+860546-0001104659-07-015618,"COPT Newport D, LLC",,
+860546-0001104659-07-015618,"COPT Northcreek, LLC",,
+860546-0001104659-07-015618,"COPT Patriot Park at Galley, LLC",,
+860546-0001104659-07-015618,"COPT Patriot Park I, LLC",,
+860546-0001104659-07-015618,"COPT Patriot Park II, LLC",,
+860546-0001104659-07-015618,"Patriot Park, LLC",,
+860546-0001104659-07-015618,"Airport Square Holdings VI and VII, LLC",,
+860546-0001104659-07-015618,"Blue Bell Investment Company, LP",,
+860546-0001104659-07-015618,"COPT Acquisitions, Inc.",,
+860546-0001104659-07-015618,"COPT Colgate General, LLC",,
+860546-0001104659-07-015618,"COPT Concourse, LLC",,
+860546-0001104659-07-015618,"COPT Gateway, LP",,
+860546-0001104659-07-015618,"COPT Gateway Commerce, LLC",,
+860546-0001104659-07-015618,"Corporate Gateway, LP",,
+860546-0001104659-07-015618,"Corporate Office Properties, LP",,
+860546-0001104659-07-015618,"Corporate Office Properties Holdings, Inc.",,
+860546-0001104659-07-015618,"Crown Point, L.L.C.",,
+860546-0001104659-07-015618,"Delaware Airport III, LLC",,
+860546-0001104659-07-015618,"Delaware Airport VIII, LLC",,
+860546-0001104659-07-015618,"Delaware Airport IX, LLC",,
+860546-0001104659-07-015618,"Great Mills I, L.L.C.",,
+860546-0001104659-07-015618,"Great Mills II, L.L.C.",,
+860546-0001104659-07-015618,"Great Mills III, L.L.C.",,
+860546-0001104659-07-015618,"Great Mills IV, L.L.C.",,
+860546-0001104659-07-015618,"Great Mills V, L.L.C.",,
+860546-0001104659-07-015618,"Harrisburg Corporate Gateway Partners, LP",,
+860546-0001104659-07-015618,"Opportunity Invest Ventures, LLC",,
+860546-0001104659-07-015618,"Sterling York, LLC",,
+860546-0001104659-07-015618,"South Brunswick Investors, LP",,
+860546-0001104659-07-015618,"11800 Tech Road, LLC",,
+860546-0001104659-07-015618,"Aerotech Manager, LLC",,
+860546-0001104659-07-015618,"Airport Square, LLC",,
+860546-0001104659-07-015618,"Airport Square II, LLC",,
+860546-0001104659-07-015618,"Airport Square IV, LLC",,
+860546-0001104659-07-015618,"Airport Square V, LLC",,
+860546-0001104659-07-015618,"Airport Square X, LLC",,
+860546-0001104659-07-015618,"Airport Square XI, LLC",,
+860546-0001104659-07-015618,"Airport Square XIII, LLC",,
+860546-0001104659-07-015618,"Airport Square XIV, LLC",,
+860546-0001104659-07-015618,"Airport Square XV, LLC",,
+860546-0001104659-07-015618,"Airport Square XIX, LLC",,
+860546-0001104659-07-015618,"Airport Square XX, LLC",,
+860546-0001104659-07-015618,"Airport Square XX Parking, LLC",,
+860546-0001104659-07-015618,"Airport Square XXI, LLC",,
+860546-0001104659-07-015618,"Airport Square XXII, LLC",,
+860546-0001104659-07-015618,"Airport Square Partners, LLC",,
+860546-0001104659-07-015618,"Airport Square Storms, LLC",,
+860546-0001104659-07-015618,"Ambassador Center, LLC",,
+860546-0001104659-07-015618,"ASI, LLC",,
+860546-0001104659-07-015618,"Atrium Building, LLC",,
+860546-0001104659-07-015618,"Brown’s Wharf, LLC",,
+860546-0001104659-07-015618,Centerpointe Limited Partnership,,
+860546-0001104659-07-015618,"Clarks Hundred, LLC",,
+860546-0001104659-07-015618,"Columbia Gateway S-28, LLC",,
+860546-0001104659-07-015618,"Commons Office Research, LLC",,
+860546-0001104659-07-015618,"Commons Office 6-B, LLC",,
+860546-0001104659-07-015618,"Concourse 1304, LLC",,
+860546-0001104659-07-015618,"COPT Arundel Preserve, LLC",,
+860546-0001104659-07-015618,"COPT Baltimore County I, LLC",,
+860546-0001104659-07-015618,"COPT Baltimore County II, LLC",,
+860546-0001104659-07-015618,"COPT Development & Construction Services, LLC",,
+860546-0001104659-07-015618,COPT Environmental Systems LLC,,
+860546-0001104659-07-015618,"COPT Gate 63, LLC",,
+860546-0001104659-07-015618,"COPT Gate 6700-6708-6724, LLC",,
+860546-0001104659-07-015618,"COPT General, LLC",,
+860546-0001104659-07-015618,"COPT Hunt Valley GP, LLC",,
+860546-0001104659-07-015618,"COPT Montpelier, LLC",,
+860546-0001104659-07-015618,"COPT Opportunity Invest I, LLC",,
+860546-0001104659-07-015618,"COPT Property Management Services, LLC",,
+860546-0001104659-07-015618,"COPT Renovation, LLC",,
+860546-0001104659-07-015618,"COPT Riverwood, LLC",,
+860546-0001104659-07-015618,"COPT T-11, LLC",,
+860546-0001104659-07-015618,"COPT-FD Indian Head, LLC",,
+860546-0001104659-07-015618,"Corporate Development Services, LLC",,
+860546-0001104659-07-015618,"Corporate Gatespring, LLC",,
+860546-0001104659-07-015618,"Corporate Gatespring II, LLC",,
+860546-0001104659-07-015618,"Corporate Office Management, Inc.",,
+860546-0001104659-07-015618,"Corporate Office Services, LLC",,
+860546-0001104659-07-015618,"Corporate Paragon, LLC",,
+860546-0001104659-07-015618,"Corporate Property, LLC",,
+860546-0001104659-07-015618,"Cornucopia Holdings, LLC",,
+860546-0001104659-07-015618,"Cornucopia Holdings II, LLC",,
+860546-0001104659-07-015618,"Enterprise Campus Developer, LLC",,
+860546-0001104659-07-015618,"Fourth Exploration, L.L.C.",,
+860546-0001104659-07-015618,"Fifth Exploration, L.L.C.",,
+860546-0001104659-07-015618,"Ft. Ritchie I, LLC",,
+860546-0001104659-07-015618,"Ft. Ritchie II, LLC",,
+860546-0001104659-07-015618,"Ft. Ritchie III, LLC",,
+860546-0001104659-07-015618,"Ft. Ritchie IV, LLC",,
+860546-0001104659-07-015618,"Ft. Ritchie Holding, LLC",,
+860546-0001104659-07-015618,"Gateway 44, LLC",,
+860546-0001104659-07-015618,"Gateway 67, LLC",,
+860546-0001104659-07-015618,"Gateway 70, LLC",,
+860546-0001104659-07-015618,"Gateway 70 Holdings, LLC",,
+860546-0001104659-07-015618,"Gateway Crossing 95, LLC",,
+860546-0001104659-07-015618,"Governors Court, LLC",,
+860546-0001104659-07-015618,"Governors Court 21, LLC",,
+860546-0001104659-07-015618,"Honeyland 108, LLC",,
+860546-0001104659-07-015618,Hunt Valley 75 Limited Partnership,,
+860546-0001104659-07-015618,"Jolly COPT I, LLC",,
+860546-0001104659-07-015618,"Jolly COPT II, LLC",,
+860546-0001104659-07-015618,"M Square NOAA, LLC",,
+860546-0001104659-07-015618,"MOR Forbes, LLC",,
+860546-0001104659-07-015618,"MOR Forbes 2, LLC",,
+860546-0001104659-07-015618,"NBP One, LLC",,
+860546-0001104659-07-015618,"NBP Huff & Puff, LLC",,
+860546-0001104659-07-015618,"NBP Lot 3-A, LLC",,
+860546-0001104659-07-015618,"NBP Retail, LLC",,
+860546-0001104659-07-015618,"NBP 131-133-141, LLC",,
+860546-0001104659-07-015618,"NBP 132, LLC",,
+860546-0001104659-07-015618,"NBP 134, LLC",,
+860546-0001104659-07-015618,"NBP 135, LLC",,
+860546-0001104659-07-015618,"NBP 140, LLC",,
+860546-0001104659-07-015618,"NBP 191, LLC",,
+860546-0001104659-07-015618,"NBP 201, LLC",,
+860546-0001104659-07-015618,"NBP 201 Holdings, LLC",,
+860546-0001104659-07-015618,"NBP 211, LLC",,
+860546-0001104659-07-015618,"NBP 211 Holdings, LLC",,
+860546-0001104659-07-015618,"NBP 220, LLC",,
+860546-0001104659-07-015618,"NBP 220 Holdings, LLC",,
+860546-0001104659-07-015618,"NBP 221, LLC",,
+860546-0001104659-07-015618,"NBP 302, LLC",,
+860546-0001104659-07-015618,"NBP 304, LLC",,
+860546-0001104659-07-015618,"NBP 306, LLC",,
+860546-0001104659-07-015618,"NBP 318, LLC",,
+860546-0001104659-07-015618,"NBP 320, LLC",,
+860546-0001104659-07-015618,"NBP 322, LLC",,
+860546-0001104659-07-015618,"Northcreek Manager, LLC",,
+860546-0001104659-07-015618,"Pecan Court, L.L.C.",,
+860546-0001104659-07-015618,"Red Cedar Building, LLC",,
+860546-0001104659-07-015618,"RIVA Trustee, LLC",,
+860546-0001104659-07-015618,"Rockville Corporate Center, LLC",,
+860546-0001104659-07-015618,Rutherford 2 Limited Partnership,,
+860546-0001104659-07-015618,"Tech Park I, LLC",,
+860546-0001104659-07-015618,"Tech Park II, LLC",,
+860546-0001104659-07-015618,"Tech Park IV, LLC",,
+860546-0001104659-07-015618,"Third Exploration, L.L.C.",,
+860546-0001104659-07-015618,"67 Financing, LLC",,
+860546-0001104659-07-015618,"110 Thomas Johnson, LLC",,
+860546-0001104659-07-015618,"134, LLC",,
+860546-0001104659-07-015618,201 International Associates Limited Partnership,,
+860546-0001104659-07-015618,"226 Schilling Circle, LLC",,
+860546-0001104659-07-015618,"230 Schilling Circle, LLC",,
+860546-0001104659-07-015618,"304 Sentinel, LLC",,
+860546-0001104659-07-015618,"800 International, LLC",,
+860546-0001104659-07-015618,"849 International, LLC",,
+860546-0001104659-07-015618,"881 Elkridge Landing, LLC",,
+860546-0001104659-07-015618,"900 International, LLC",,
+860546-0001104659-07-015618,"930 International, LLC",,
+860546-0001104659-07-015618,"999 Corporate, LLC",,
+860546-0001104659-07-015618,"1099 Winterson, LLC",,
+860546-0001104659-07-015618,"1190 Winterson, LLC",,
+860546-0001104659-07-015618,"1199 Winterson, LLC",,
+860546-0001104659-07-015618,"1460 Dorsey Road, LLC",,
+860546-0001104659-07-015618,2500 Riva Trust,,
+860546-0001104659-07-015618,"2691 Technology, LLC",,
+860546-0001104659-07-015618,"2900 Lord Baltimore Drive, LLC",,
+860546-0001104659-07-015618,"6700 Alexander Bell, LLC",,
+860546-0001104659-07-015618,"6711 Gateway, LLC",,
+860546-0001104659-07-015618,"6711 Gateway Funding, LLC",,
+860546-0001104659-07-015618,"6721 Gateway, LLC",,
+860546-0001104659-07-015618,"6731 Gateway, LLC",,
+860546-0001104659-07-015618,"6741 Gateway, LLC",,
+860546-0001104659-07-015618,"6940 CGD, LLC",,
+860546-0001104659-07-015618,"7000 CG, LLC",,
+860546-0001104659-07-015618,"7000 Honeys, LLC",,
+860546-0001104659-07-015618,"7015 Albert Einstein Drive, LLC",,
+860546-0001104659-07-015618,"7130 Columbia Gateway, LLC",,
+860546-0001104659-07-015618,"7200 Riverwood, LLC",,
+860546-0001104659-07-015618,"7210 Ambassador Road, LLC",,
+860546-0001104659-07-015618,"7240 Parkway Drive Enterprises, LLC",,
+860546-0001104659-07-015618,"7253 Ambassador Road, LLC",,
+860546-0001104659-07-015618,"7318 Parkway Drive Enterprises, LLC",,
+860546-0001104659-07-015618,"7320 Parkway Drive Enterprises, LLC",,
+860546-0001104659-07-015618,"7320 PD, LLC",,
+860546-0001104659-07-015618,"7321 Parkway Drive Enterprises, LLC",,
+860546-0001104659-07-015618,"7468 Candlewood Road, LLC",,
+860546-0001104659-07-015618,"8621 RFD, LLC",,
+860546-0001104659-07-015618,"8661 RFD, LLC",,
+860546-0001104659-07-015618,"9690 Deereco Road, LLC",,
+860546-0001104659-07-015618,"11011 McCormick Road, LLC",,
+860546-0001104659-07-015618,"11101 McCormick Road, LLC",,
+860546-0001104659-07-015618,"COPT Princeton South, LLC",,
+860546-0001104659-07-015618,"Cuaba Associates, L.L.C.",,
+860546-0001104659-07-015618,"68 Culver, LLC",,
+860546-0001104659-07-015618,Route 46 Partners,,
+860546-0001104659-07-015618,"Route 46 Partners, L.L.C.",,
+860546-0001104659-07-015618,"Bolivar Associates, LLC",,
+860546-0001104659-07-015618,"Colgatedrive Associates, L.P.",,
+860546-0001104659-07-015618,"COPT Pennlyn, L.P.",,
+860546-0001104659-07-015618,"COPT San Antonio General, LLC",,
+860546-0001104659-07-015618,"COPT San Antonio, LP",,
+860546-0001104659-07-015618,"COPT Chantilly, LLC",,
+860546-0001104659-07-015618,"COPT Chantilly II, LLC",,
+860546-0001104659-07-015618,"COPT Dahlgren, LLC",,
+860546-0001104659-07-015618,"COPT Dahlgren I, LLC",,
+860546-0001104659-07-015618,"COPT Dahlgren II, LLC",,
+860546-0001104659-07-015618,"COPT Dahlgren IV, LLC",,
+860546-0001104659-07-015618,"COPT Dahlgren Land, LLC",,
+860546-0001104659-07-015618,"COPT Greens I, LLC",,
+860546-0001104659-07-015618,"COPT Greens II, LLC",,
+860546-0001104659-07-015618,"COPT Greens III, LLC",,
+860546-0001104659-07-015618,"COPT Park Meadow, LLC",,
+860546-0001104659-07-015618,"COPT Parkstone, LLC",,
+860546-0001104659-07-015618,"COPT Richmond I, LLC",,
+860546-0001104659-07-015618,"COPT Ridgeview I, LLC",,
+860546-0001104659-07-015618,"COPT Ridgeview II & III, LLC",,
+860546-0001104659-07-015618,"COPT Southwest VA, LLC",,
+860546-0001104659-07-015618,"COPT Stonecroft, LLC",,
+860546-0001104659-07-015618,"COPT Sunrise, LLC",,
+860546-0001104659-07-015618,"COPT Waterview I, LLC",,
+860546-0001104659-07-015618,"COPT Waterview III, LLC",,
+860546-0001104659-07-015618,"TRC Pinnacle Towers, L.L.C.",,
+860546-0001104659-07-015618,"2900 Towerview Road, LLC",,
+875622-0001140361-17-012337,Advance Biofactures Corp.,New York,
+891014-0000891014-11-000007,APP China Specialty Minerals Pte Ltd.,Singapore,
+891014-0000891014-11-000007,ASMAS Agir Sanayi Malzemeleri Imal ve Tic. A.S.,Turkey,
+891014-0000891014-11-000007,Barretts Minerals Inc.,Delaware,
+891014-0000891014-11-000007,ComSource Trading Ltd.,Delaware,
+891014-0000891014-11-000007,Gold Lun Chemicals (Zhenjiang).,China,
+891014-0000891014-11-000007,"Gold Sheng Chemicals (Zhenjiang) Co., Ltd.",China,
+891014-0000891014-11-000007,"Gold Zuan Chemicals (Suzhou) Co., Ltd.",China,
+891014-0000891014-11-000007,"Hi-Tech Specialty Minerals Company, Limited",Thailand,
+891014-0000891014-11-000007,Minerals Technologies do Brasil Comercio é Industria de Minerais Ltda.,Brazil,
+891014-0000891014-11-000007,Minerals Technologies Europe N.V.,Belgium,
+891014-0000891014-11-000007,Minerals Technologies Holdings Inc.,Delaware,
+891014-0000891014-11-000007,Minerals Technologies Holdings Ltd.,United Kingdom,
+891014-0000891014-11-000007,Minerals Technologies India Private Limited,India,
+891014-0000891014-11-000007,"Minerals Technologies Mexico Holdings, S. de R. L. de C.V.",Mexico,
+891014-0000891014-11-000007,Minerals Technologies South Africa (Pty) Ltd.,South Africa,
+891014-0000891014-11-000007,Mintech Canada Inc.,Canada,
+891014-0000891014-11-000007,Mintech Japan K.K.,Japan,
+891014-0000891014-11-000007,Minteq Australia Pty Ltd.,Australia,
+891014-0000891014-11-000007,Minteq B.V.,The Netherlands,
+891014-0000891014-11-000007,Minteq Europe Limited.,Ireland,
+891014-0000891014-11-000007,Minteq International GmbH,Germany,
+891014-0000891014-11-000007,Minteq International Inc.,Delaware,
+891014-0000891014-11-000007,"Minteq International (Suzhou) Co., Ltd.",China,
+891014-0000891014-11-000007,Minteq Italiana S.p.A.,Italy,
+891014-0000891014-11-000007,Minteq Korea Inc.,Korea,
+891014-0000891014-11-000007,Minteq Kosovo LLC.,Kosovo,
+891014-0000891014-11-000007,Minteq Magnesite Limited,Ireland,
+891014-0000891014-11-000007,"Minteq Metallurgical Materials (Suzhou) Co., Ltd.",China,
+891014-0000891014-11-000007,Minteq Shapes and Services Inc.,Delaware,
+891014-0000891014-11-000007,Minteq UK Limited.,United Kingdom,
+891014-0000891014-11-000007,MTI Bermuda L.P.,Bermuda,
+891014-0000891014-11-000007,MTI Holdings GmbH,Germany,
+891014-0000891014-11-000007,MTI Holding Singapore Pte. Ltd.,Singapore,
+891014-0000891014-11-000007,MTI Holdco I LLC,Delaware,
+891014-0000891014-11-000007,MTI Holdco II LLC,Delaware,
+891014-0000891014-11-000007,MTI Netherlands B.V.,Netherlands,
+891014-0000891014-11-000007,MTX Finance Inc.,Delaware,
+891014-0000891014-11-000007,MTX Finance Ireland,Ireland,
+891014-0000891014-11-000007,Performance Minerals Netherlands C.V.,Netherlands,
+891014-0000891014-11-000007,PT Sinar Mas Specialty Minerals,Indonesia,
+891014-0000891014-11-000007,"Rijnstaal U.S.A., Inc.",Pennsylvania,
+891014-0000891014-11-000007,SMI NewQuest India Private Limited SMI Poland Sp. z o.o.,India,
+891014-0000891014-11-000007,Specialty Minerals Benelux,Poland,
+891014-0000891014-11-000007,Specialty Minerals FMT K.K.,Belgium,
+891014-0000891014-11-000007,Specialty Minerals France s.p.a.s.,Japan,
+891014-0000891014-11-000007,Specialty Minerals GmbH,France,
+891014-0000891014-11-000007,Specialty Minerals Inc.,Germany,
+891014-0000891014-11-000007,Specialty Minerals India Holding Inc.,Delaware,
+891014-0000891014-11-000007,Specialty Minerals International Inc.,Delaware,
+891014-0000891014-11-000007,Specialty Minerals Malaysia Sdn. Bhd.,Malaysia,
+891014-0000891014-11-000007,Specialty Minerals (Michigan) Inc.,Michigan,
+891014-0000891014-11-000007,Specialty Minerals Mississippi Inc.,Delaware,
+891014-0000891014-11-000007,Specialty Minerals Nordic Oy Ab,Finland,
+891014-0000891014-11-000007,"Specialty Minerals (Portugal) Especialidades Minerais, S.A.",Portugal,
+891014-0000891014-11-000007,Specialty Minerals S.A. de C.V.,Mexico,
+891014-0000891014-11-000007,Specialty Minerals Servicios S. de R.L. de C.V.,Mexico,
+891014-0000891014-11-000007,"Specialty Minerals Slovakia, spol. sr.o.",Slovakia,
+891014-0000891014-11-000007,Specialty Minerals South Africa (Pty) Limited,South Africa,
+891014-0000891014-11-000007,Specialty Minerals (Thailand) Limited,Thailand,
+891014-0000891014-11-000007,Specialty Minerals UK Limited,United Kingdom,
+891014-0000891014-11-000007,"Tecnologias Minerales de Mexico, S.A. de C.V.",Mexico,
+891014-0000891014-11-000007,Yangpu Gold Hongda Chemicals Co. Ltd.,China,

From 581b2e31129e73ca040948ff4ea6cd6b7a7dd01a Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Sun, 6 Oct 2024 10:13:19 -0400
Subject: [PATCH 095/161] Use run name for specifying training runs

---
 src/mozilla_sec_eia/models/sec10k/__init__.py  |  5 ++---
 .../sec10k/notebooks/exhibit21_extractor.ipynb | 18 ++++++++++--------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py
index 9bb3557..a063f50 100644
--- a/src/mozilla_sec_eia/models/sec10k/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/__init__.py
@@ -53,9 +53,8 @@
 class TrainConfig(Config):
     """Config for training notebook."""
 
-    layoutlm_uri: str | None = (
-        "runs:/32355367ed444dd0b07f2d1b845f62d8/layoutlm_extractor"
-    )
+    #: mlflow run name used to train layoutlm model
+    layoutlm_training_run: str | None = "layoutlm-labeledv0.2"
 
 
 exhibit21_extractor = define_dagstermill_asset(
diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
index cc92a1e..69893da 100644
--- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
+++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
@@ -52,7 +52,7 @@
     "from mozilla_sec_eia.models.sec10k import defs\n",
     "\n",
     "context = dagstermill.get_context(op_config={\n",
-    "    \"layoutlm_uri\": None,\n",
+    "    \"layoutlm_training_run\": None,\n",
     "})\n",
     "\n",
     "ex21_training_data = defs.load_asset_value(\"ex21_training_data\", partition_key=\"labeledv0.2\")\n",
@@ -222,9 +222,15 @@
     "\n",
     "    return encoding\n",
     "\n",
+    "if (run_name := context.op_config[\"layoutlm_training_run\"]) is not None:\n",
+    "    filter_string = f\"attributes.run_name = '{run_name}'\"\n",
+    "    run = mlflow.search_runs(filter_string=filter_string, output_format=\"list\")[0]\n",
+    "    training_run_id = run.info.run_id\n",
+    "else:\n",
+    "    training_run_id = None\n",
+    "\n",
     "# Only finetune if configured to do so\n",
-    "training_run_id = None\n",
-    "if context.op_config[\"layoutlm_uri\"] is None:\n",
+    "if training_run_id is None:\n",
     "    id2label, label2id = get_id_label_conversions(LABELS)\n",
     "    # Change temp_dir to save training data locally for inspection\n",
     "    # Cache/prepare training data\n",
@@ -497,11 +503,7 @@
     ")\n",
     "\n",
     "# If a model was trained in this notebook, use it. Otherwise, use\n",
-    "if training_run_id is not None:\n",
-    "    model_uri = f\"runs:/{training_run_id}/layoutlm_extractor\"\n",
-    "else:\n",
-    "    model_uri = context.op_config[\"layoutlm_uri\"]\n",
-    "\n",
+    "model_uri = f\"runs:/{training_run_id}/layoutlm_extractor\"\n",
     "model_info = mlflow.models.get_model_info(model_uri)\n",
     "\n",
     "def _get_data(dataset):\n",

From c67a1bef016b398bafd4ad3ee5e3540e1eadc3dc Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Sun, 6 Oct 2024 13:40:40 -0400
Subject: [PATCH 096/161] Rework how notebook is configured

---
 src/mozilla_sec_eia/models/sec10k/__init__.py | 10 +---------
 .../models/sec10k/ex_21/data/__init__.py      | 19 ++++++++++++-------
 2 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py
index a063f50..33dd5b8 100644
--- a/src/mozilla_sec_eia/models/sec10k/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/__init__.py
@@ -50,17 +50,10 @@
 )
 
 
-class TrainConfig(Config):
-    """Config for training notebook."""
-
-    #: mlflow run name used to train layoutlm model
-    layoutlm_training_run: str | None = "layoutlm-labeledv0.2"
-
-
 exhibit21_extractor = define_dagstermill_asset(
     name="exhibit21_extractor",
     notebook_path=file_relative_path(__file__, "notebooks/exhibit21_extractor.ipynb"),
-    config_schema=TrainConfig.to_config_schema(),
+    config_schema=ex_21.data.Ex21TrainConfig.to_config_schema(),
     ins={
         "ex21_training_data": AssetIn(),
         "ex21_validation_set": AssetIn(),
@@ -68,7 +61,6 @@ class TrainConfig(Config):
         "ex21_inference_dataset": AssetIn(),
     },
     save_notebook_on_failure=True,
-    partitions_def=ex_21.data.TRAINING_DATA_VERSION_PARTS,
 )
 ex21_training_job = define_asset_job(
     "ex21_training",
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py
index 06860f1..afa1f7a 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py
@@ -7,7 +7,7 @@
 from dagster import (
     AssetExecutionContext,
     AssetOut,
-    StaticPartitionsDefinition,
+    Config,
     asset,
     multi_asset,
 )
@@ -20,19 +20,24 @@
 from .inference import create_inference_dataset
 from .training import format_as_ner_annotations
 
-TRAINING_DATA_VERSION_PARTS = StaticPartitionsDefinition(
-    ["labeledv0.0", "labeledv0.1", "labeledv0.2"]
-)
 
+class Ex21TrainConfig(Config):
+    """Config for training notebook."""
+
+    #: mlflow run name used to train layoutlm model
+    layoutlm_training_run: str | None = "layoutlm-labeledv0.2"
+    #: training data version (doesn't matter if using pretrained model)
+    training_data_version: str | None = "v0.2"
 
-@asset(partitions_def=TRAINING_DATA_VERSION_PARTS)
-def ex21_training_data(context: AssetExecutionContext):
+
+@asset
+def ex21_training_data(config: Ex21TrainConfig):
     """Construct training dataset for ex 21 extraction."""
     with TemporaryDirectory() as temp_dir:
         ner_annotations = format_as_ner_annotations(
             labeled_json_path=Path(temp_dir) / "sec10k_filings" / "labeled_jsons",
             pdfs_path=Path(temp_dir) / "sec10k_filings" / "pdfs",
-            gcs_folder_name=context.partition_key,
+            gcs_folder_name=f"labeled{config.training_data_version}",
         )
     return ner_annotations
 

From b8a5b247d4637fb6fd3a1f4e7c4358077a594f3c Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Sun, 6 Oct 2024 15:00:46 -0400
Subject: [PATCH 097/161] Finetune configuration

---
 src/mozilla_sec_eia/models/sec10k/__init__.py                   | 1 -
 src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py        | 2 +-
 .../models/sec10k/notebooks/exhibit21_extractor.ipynb           | 2 +-
 3 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py
index 33dd5b8..8087fb8 100644
--- a/src/mozilla_sec_eia/models/sec10k/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/__init__.py
@@ -2,7 +2,6 @@
 
 from dagster import (
     AssetIn,
-    Config,
     Definitions,
     define_asset_job,
     file_relative_path,
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py
index afa1f7a..2ae0b1e 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py
@@ -27,7 +27,7 @@ class Ex21TrainConfig(Config):
     #: mlflow run name used to train layoutlm model
     layoutlm_training_run: str | None = "layoutlm-labeledv0.2"
     #: training data version (doesn't matter if using pretrained model)
-    training_data_version: str | None = "v0.2"
+    training_data_version: str = "v0.2"
 
 
 @asset
diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
index 69893da..f989b4e 100644
--- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
+++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
@@ -52,7 +52,7 @@
     "from mozilla_sec_eia.models.sec10k import defs\n",
     "\n",
     "context = dagstermill.get_context(op_config={\n",
-    "    \"layoutlm_training_run\": None,\n",
+    "    \"layoutlm_training_run\": \"layoutlm-labeledv0.2\",\n",
     "})\n",
     "\n",
     "ex21_training_data = defs.load_asset_value(\"ex21_training_data\", partition_key=\"labeledv0.2\")\n",

From 45d5cf8f9b280bda8c9ae80f103f98bcb01ca4c3 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Mon, 7 Oct 2024 13:11:06 -0400
Subject: [PATCH 098/161] separate inference dataset creation from model
 prediction

---
 .../library/mlflow/mlflow_io_managers.py      |  2 +-
 src/mozilla_sec_eia/library/model_jobs.py     | 15 +++++--
 src/mozilla_sec_eia/models/sec10k/__init__.py |  8 ++--
 .../models/sec10k/ex_21/__init__.py           | 44 ++++++++++++++++---
 .../models/sec10k/ex_21/data/__init__.py      |  1 -
 .../notebooks/exhibit21_extractor.ipynb       | 42 ++++++++++++++++--
 6 files changed, 93 insertions(+), 19 deletions(-)

diff --git a/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py b/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py
index 94468f5..abc2d1c 100644
--- a/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py
+++ b/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py
@@ -46,7 +46,7 @@ def load_input(self, context: InputContext):
         if model_uri is None:
             model_uri = f"models:/{context.name}"
 
-        mlflow.pyfunc.load_model(
+        return mlflow.pyfunc.load_model(
             model_uri,
             dst_path=cache_path,
         )
diff --git a/src/mozilla_sec_eia/library/model_jobs.py b/src/mozilla_sec_eia/library/model_jobs.py
index 87f6d15..e3c9801 100644
--- a/src/mozilla_sec_eia/library/model_jobs.py
+++ b/src/mozilla_sec_eia/library/model_jobs.py
@@ -25,6 +25,7 @@ def create_production_model_job(
     job_name: str,
     assets: list[AssetsDefinition],
     concurrency_limit: int | None = None,
+    tag_concurrency_limits: list[dict] | None = None,
     **kwargs,
 ) -> JobDefinition:
     """Construct a dagster job and supply Definitions with assets and resources."""
@@ -39,10 +40,16 @@ def create_production_model_job(
             }
         },
     }
-    if concurrency_limit is not None:
-        config["execution"] = {
-            "config": {"multiprocess": {"max_concurrent": concurrency_limit}}
-        }
+    if (concurrency_limit is not None) or (tag_concurrency_limits is not None):
+        config["execution"] = {"config": {"multiprocess": {}}}
+        if concurrency_limit is not None:
+            config["execution"]["config"]["multiprocess"][
+                "max_concurrent"
+            ] = concurrency_limit
+        else:
+            config["execution"]["config"]["multiprocess"][
+                "tag_concurrency_limits"
+            ] = tag_concurrency_limits
 
     return define_asset_job(
         job_name,
diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py
index 8087fb8..f0bb091 100644
--- a/src/mozilla_sec_eia/models/sec10k/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/__init__.py
@@ -45,7 +45,9 @@
 ex21_production_job = model_jobs.create_production_model_job(
     "ex21_extraction",
     ex_21.production_assets,
-    concurrency_limit=4,
+    tag_concurrency_limits=[
+        {"key": "model", "value": "exhibit21_extractor", "limit": 2},
+    ],
 )
 
 
@@ -85,10 +87,10 @@
         "mlflow_interface": mlflow_interface_resource,
         "layoutlm_io_manager": MlflowPyfuncModelIOManager(
             mlflow_interface=mlflow_interface_resource,
-            uri="runs:/b959cfa0ba3c4b91a0f8fe158cd0109f/exhibit21_extractor",
+            uri="runs:/d603f8e219da4fd39f3c2f8d7d3bcb40/exhibit21_extractor",
         ),
         "pandas_parquet_io_manager": PandasParquetIOManager(
-            base_path=UPath("gs://sec10k-outputs")
+            base_path=UPath("gs://sec10k-outputs/v2")
         ),
         "output_notebook_io_manager": ConfigurableLocalOutputNotebookIOManager(),
     }
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
index 574074d..b5bc167 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
@@ -1,6 +1,7 @@
 """Module for working with exhibit 21 data."""
 
 import logging
+import traceback
 
 import pandas as pd
 from dagster import (
@@ -10,6 +11,7 @@
     graph_multi_asset,
     op,
 )
+from mlflow.pyfunc import PyFuncModel
 
 from ..entities import (
     Ex21CompanyOwnership,
@@ -18,7 +20,8 @@
     sec10k_extract_metadata_type,
 )
 from ..extract import chunk_filings, sec10k_filing_metadata, year_quarter_partitions
-from .inference import extract_filings
+from ..utils.cloud import GCSArchive
+from .data.inference import create_inference_dataset
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
@@ -29,13 +32,32 @@
         "extracted": Out(dagster_type=ex21_extract_type),
     },
     ins={"exhibit21_extractor": In(input_manager_key="layoutlm_io_manager")},
+    tags={"model": "exhibit21_extractor"},
 )
 def extract_filing_chunk(
-    filings: pd.DataFrame,
+    parsed_chunk: tuple[pd.DataFrame, pd.DataFrame],
     exhibit21_extractor,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Extract a set of filings and return results."""
-    return extract_filings(filings, exhibit21_extractor)
+    failed_parsing_metadata, inference_dataset = parsed_chunk
+    extracted = Ex21CompanyOwnership.example(size=0)
+    try:
+        if not inference_dataset.empty:
+            metadata, extracted = exhibit21_extractor.predict(inference_dataset)
+            metadata = pd.concat([failed_parsing_metadata, metadata])
+        else:
+            metadata = failed_parsing_metadata
+    except Exception as e:
+        logger.warning(traceback.format_exc())
+        logger.warning(f"Error while extracting filings: {inference_dataset['id']}")
+        metadata = pd.DataFrame(
+            {
+                "filename": inference_dataset["id"],
+                "success": [False] * len(inference_dataset),
+                "notes": [str(e)] * len(inference_dataset),
+            }
+        ).set_index("filename")
+    return metadata, extracted
 
 
 @op(
@@ -65,6 +87,17 @@ def collect_extracted_chunks(
     )
 
 
+@op
+def create_dataset(
+    cloud_interface: GCSArchive, filings: pd.DataFrame
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Construct inference dataset from filing chunk."""
+    return create_inference_dataset(
+        filing_metadata=filings,
+        cloud_interface=cloud_interface,
+    )
+
+
 @graph_multi_asset(
     outs={
         "ex21_extraction_metadata": AssetOut(
@@ -81,9 +114,8 @@ def ex21_extract(
 ):
     """Extract ownership info from exhibit 21 docs."""
     filing_chunks = chunk_filings(sec10k_filing_metadata)
-    metadata_chunks, extracted_chunks = filing_chunks.map(
-        lambda filings: extract_filing_chunk(filings)
-    )
+    parsed_chunks = filing_chunks.map(create_dataset)
+    metadata_chunks, extracted_chunks = parsed_chunks.map(extract_filing_chunk)
     metadata, extracted = collect_extracted_chunks(
         metadata_chunks.collect(), extracted_chunks.collect()
     )
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py
index 2ae0b1e..6c5e8aa 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py
@@ -5,7 +5,6 @@
 
 import pandas as pd
 from dagster import (
-    AssetExecutionContext,
     AssetOut,
     Config,
     asset,
diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
index f989b4e..7fc14b5 100644
--- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
+++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
@@ -38,14 +38,46 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "48f185de-95ef-4194-9245-93f8d603d2e6",
    "metadata": {
     "tags": [
      "parameters"
     ]
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n",
+      "2024-10-06 15:23:43 -0400 - dagster - DEBUG - system - Loading 11 partitions...\n",
+      "2024-10-06 15:23:43 -0400 - dagster - DEBUG - system - Loading partition l from /home/zach/catalyst/workspace/storage/ex21_training_data/l using PickledObjectFilesystemIOManager...\n"
+     ]
+    },
+    {
+     "ename": "NotADirectoryError",
+     "evalue": "[Errno 20] Not a directory: '/home/zach/catalyst/workspace/storage/ex21_training_data/l'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNotADirectoryError\u001b[0m                        Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[1], line 9\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mmozilla_sec_eia\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01msec10k\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m defs\n\u001b[1;32m      5\u001b[0m context \u001b[38;5;241m=\u001b[39m dagstermill\u001b[38;5;241m.\u001b[39mget_context(op_config\u001b[38;5;241m=\u001b[39m{\n\u001b[1;32m      6\u001b[0m     \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlayoutlm_training_run\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlayoutlm-labeledv0.2\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m      7\u001b[0m })\n\u001b[0;32m----> 9\u001b[0m ex21_training_data \u001b[38;5;241m=\u001b[39m \u001b[43mdefs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_asset_value\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mex21_training_data\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpartition_key\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mlabeledv0.2\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m     11\u001b[0m ex21_failed_parsing_metadata \u001b[38;5;241m=\u001b[39m defs\u001b[38;5;241m.\u001b[39mload_asset_value(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mex21_failed_parsing_metadata\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m     12\u001b[0m ex21_inference_dataset \u001b[38;5;241m=\u001b[39m defs\u001b[38;5;241m.\u001b[39mload_asset_value(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mex21_inference_dataset\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
+      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/definitions/definitions_class.py:519\u001b[0m, in \u001b[0;36mDefinitions.load_asset_value\u001b[0;34m(self, asset_key, python_type, instance, partition_key, metadata)\u001b[0m\n\u001b[1;32m    490\u001b[0m \u001b[38;5;129m@public\u001b[39m\n\u001b[1;32m    491\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload_asset_value\u001b[39m(\n\u001b[1;32m    492\u001b[0m     \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    498\u001b[0m     metadata: Optional[Dict[\u001b[38;5;28mstr\u001b[39m, Any]] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m    499\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mobject\u001b[39m:\n\u001b[1;32m    500\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124;03m\"\"\"Load the contents of an asset as a Python object.\u001b[39;00m\n\u001b[1;32m    501\u001b[0m \n\u001b[1;32m    502\u001b[0m \u001b[38;5;124;03m    Invokes `load_input` on the :py:class:`IOManager` associated with the asset.\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    517\u001b[0m \u001b[38;5;124;03m        The contents of an asset as a Python object.\u001b[39;00m\n\u001b[1;32m    518\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[0;32m--> 519\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_repository_def\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_asset_value\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    520\u001b[0m \u001b[43m        \u001b[49m\u001b[43masset_key\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43masset_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    521\u001b[0m \u001b[43m        
\u001b[49m\u001b[43mpython_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpython_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    522\u001b[0m \u001b[43m        \u001b[49m\u001b[43minstance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minstance\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    523\u001b[0m \u001b[43m        \u001b[49m\u001b[43mpartition_key\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpartition_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    524\u001b[0m \u001b[43m        \u001b[49m\u001b[43mmetadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmetadata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    525\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/definitions/repository_definition/repository_definition.py:350\u001b[0m, in \u001b[0;36mRepositoryDefinition.load_asset_value\u001b[0;34m(self, asset_key, python_type, instance, partition_key, metadata, resource_config)\u001b[0m\n\u001b[1;32m    346\u001b[0m normalized_assets_defs_by_key \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m    347\u001b[0m     k: ad \u001b[38;5;28;01mfor\u001b[39;00m ad \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39masset_graph\u001b[38;5;241m.\u001b[39massets_defs \u001b[38;5;28;01mfor\u001b[39;00m k \u001b[38;5;129;01min\u001b[39;00m ad\u001b[38;5;241m.\u001b[39mkeys\n\u001b[1;32m    348\u001b[0m }\n\u001b[1;32m    349\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m AssetValueLoader(normalized_assets_defs_by_key, instance\u001b[38;5;241m=\u001b[39minstance) \u001b[38;5;28;01mas\u001b[39;00m loader:\n\u001b[0;32m--> 350\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mloader\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_asset_value\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    351\u001b[0m \u001b[43m        \u001b[49m\u001b[43masset_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    352\u001b[0m \u001b[43m        \u001b[49m\u001b[43mpython_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpython_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    353\u001b[0m \u001b[43m        \u001b[49m\u001b[43mpartition_key\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpartition_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    354\u001b[0m \u001b[43m        \u001b[49m\u001b[43mmetadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmetadata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    355\u001b[0m \u001b[43m        \u001b[49m\u001b[43mresource_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresource_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    
356\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/decorator_utils.py:203\u001b[0m, in \u001b[0;36m_wrap_with_pre_call_fn.<locals>.wrapped_with_pre_call_fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    201\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m condition \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mor\u001b[39;00m condition(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m    202\u001b[0m     pre_call_fn()\n\u001b[0;32m--> 203\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/storage/asset_value_loader.py:169\u001b[0m, in \u001b[0;36mAssetValueLoader.load_asset_value\u001b[0;34m(self, asset_key, python_type, partition_key, input_definition_metadata, resource_config, metadata)\u001b[0m\n\u001b[1;32m    139\u001b[0m io_manager_config \u001b[38;5;241m=\u001b[39m get_mapped_resource_config(\n\u001b[1;32m    140\u001b[0m     {io_manager_key: io_manager_def}, io_resource_config\n\u001b[1;32m    141\u001b[0m )\n\u001b[1;32m    143\u001b[0m input_context \u001b[38;5;241m=\u001b[39m build_input_context(\n\u001b[1;32m    144\u001b[0m     name\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m    145\u001b[0m     asset_key\u001b[38;5;241m=\u001b[39masset_key,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    166\u001b[0m     ),\n\u001b[1;32m    167\u001b[0m )\n\u001b[0;32m--> 169\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mio_manager\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_input\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_context\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/storage/upath_io_manager.py:412\u001b[0m, in \u001b[0;36mUPathIOManager.load_input\u001b[0;34m(self, context)\u001b[0m\n\u001b[1;32m    410\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m    411\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 412\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_load_partitions\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcontext\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/storage/upath_io_manager.py:396\u001b[0m, in \u001b[0;36mUPathIOManager._load_partitions\u001b[0;34m(self, context)\u001b[0m\n\u001b[1;32m    393\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_load_partitions\u001b[39m(\u001b[38;5;28mself\u001b[39m, context: InputContext) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Dict[\u001b[38;5;28mstr\u001b[39m, Any]:\n\u001b[1;32m    394\u001b[0m     \u001b[38;5;66;03m# load multiple partitions\u001b[39;00m\n\u001b[1;32m    395\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m inspect\u001b[38;5;241m.\u001b[39miscoroutinefunction(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mload_from_path):\n\u001b[0;32m--> 396\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_partitions\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcontext\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    397\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    398\u001b[0m         \u001b[38;5;66;03m# load_from_path returns a coroutine, so we need to await the results\u001b[39;00m\n\u001b[1;32m    399\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mload_partitions_async(context)\n",
+      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/storage/upath_io_manager.py:81\u001b[0m, in \u001b[0;36mUPathIOManager.load_partitions\u001b[0;34m(self, context)\u001b[0m\n\u001b[1;32m     78\u001b[0m objs \u001b[38;5;241m=\u001b[39m {}\n\u001b[1;32m     80\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m partition_key \u001b[38;5;129;01min\u001b[39;00m context\u001b[38;5;241m.\u001b[39masset_partition_keys:\n\u001b[0;32m---> 81\u001b[0m     obj \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_load_partition_from_path\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m     82\u001b[0m \u001b[43m        \u001b[49m\u001b[43mcontext\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     83\u001b[0m \u001b[43m        \u001b[49m\u001b[43mpartition_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     84\u001b[0m \u001b[43m        \u001b[49m\u001b[43mpaths\u001b[49m\u001b[43m[\u001b[49m\u001b[43mpartition_key\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     85\u001b[0m \u001b[43m        \u001b[49m\u001b[43mbackcompat_paths\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpartition_key\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     86\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     87\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m obj \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:  \u001b[38;5;66;03m# in case some partitions were skipped\u001b[39;00m\n\u001b[1;32m     88\u001b[0m         objs[partition_key] \u001b[38;5;241m=\u001b[39m obj\n",
+      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/storage/upath_io_manager.py:307\u001b[0m, in \u001b[0;36mUPathIOManager._load_partition_from_path\u001b[0;34m(self, context, partition_key, path, backcompat_path)\u001b[0m\n\u001b[1;32m    305\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m    306\u001b[0m     context\u001b[38;5;241m.\u001b[39mlog\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_loading_input_partition_log_message(path, partition_key))\n\u001b[0;32m--> 307\u001b[0m     obj \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_from_path\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcontext\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcontext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    308\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m obj\n\u001b[1;32m    309\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n",
+      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/storage/fs_io_manager.py:283\u001b[0m, in \u001b[0;36mPickledObjectFilesystemIOManager.load_from_path\u001b[0;34m(self, context, path)\u001b[0m\n\u001b[1;32m    282\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload_from_path\u001b[39m(\u001b[38;5;28mself\u001b[39m, context: InputContext, path: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUPath\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Any:\n\u001b[0;32m--> 283\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43mpath\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopen\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrb\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m file:\n\u001b[1;32m    284\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m pickle\u001b[38;5;241m.\u001b[39mload(file)\n",
+      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/upath/implementations/local.py:134\u001b[0m, in \u001b[0;36mPosixUPath.open\u001b[0;34m(self, mode, buffering, encoding, errors, newline, **fsspec_kwargs)\u001b[0m\n\u001b[1;32m    125\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m(LocalPath, \u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39mopen(\n\u001b[1;32m    126\u001b[0m         mode\u001b[38;5;241m=\u001b[39mmode,\n\u001b[1;32m    127\u001b[0m         buffering\u001b[38;5;241m=\u001b[39mbuffering,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    131\u001b[0m         \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mfsspec_kwargs,\n\u001b[1;32m    132\u001b[0m     )\n\u001b[1;32m    133\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 134\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mPosixPath\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopen\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuffering\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnewline\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/pathlib.py:1044\u001b[0m, in \u001b[0;36mPath.open\u001b[0;34m(self, mode, buffering, encoding, errors, newline)\u001b[0m\n\u001b[1;32m   1042\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m   1043\u001b[0m     encoding \u001b[38;5;241m=\u001b[39m io\u001b[38;5;241m.\u001b[39mtext_encoding(encoding)\n\u001b[0;32m-> 1044\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m io\u001b[38;5;241m.\u001b[39mopen(\u001b[38;5;28mself\u001b[39m, mode, buffering, encoding, errors, newline)\n",
+      "\u001b[0;31mNotADirectoryError\u001b[0m: [Errno 20] Not a directory: '/home/zach/catalyst/workspace/storage/ex21_training_data/l'"
+     ]
+    }
+   ],
    "source": [
     "import dagstermill\n",
     "\n",
@@ -55,7 +87,7 @@
     "    \"layoutlm_training_run\": \"layoutlm-labeledv0.2\",\n",
     "})\n",
     "\n",
-    "ex21_training_data = defs.load_asset_value(\"ex21_training_data\", partition_key=\"labeledv0.2\")\n",
+    "ex21_training_data = defs.load_asset_value(\"ex21_training_data\")\n",
     "\n",
     "ex21_failed_parsing_metadata = defs.load_asset_value(\"ex21_failed_parsing_metadata\")\n",
     "ex21_inference_dataset = defs.load_asset_value(\"ex21_inference_dataset\")\n",
@@ -715,7 +747,9 @@
     "        python_model=Ex21Extractor(),\n",
     "        artifacts={\"model_components\": model_uri},\n",
     "        signature=infer_signature(ex21_inference_dataset, extracted), # NOTE: model returns a second dataframe with metadata, but mlflow only supports one in signature\n",
-    "    )"
+    "    )\n",
+    "    mlflow.log_table(extracted, \"extracted_data.json\")\n",
+    "    mlflow.log_table(metadata, \"extraction_metadata.json\")"
    ]
   }
  ],

From 3e15b1f1a107463575bf6a2bb1751fbd61667159 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Mon, 7 Oct 2024 14:08:52 -0400
Subject: [PATCH 099/161] Remove deprecated inference module

---
 .../models/sec10k/ex_21/__init__.py           |  2 +-
 .../models/sec10k/ex_21/inference.py          | 40 -------------------
 2 files changed, 1 insertion(+), 41 deletions(-)
 delete mode 100644 src/mozilla_sec_eia/models/sec10k/ex_21/inference.py

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
index b5bc167..34f6c3a 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
@@ -36,7 +36,7 @@
 )
 def extract_filing_chunk(
     parsed_chunk: tuple[pd.DataFrame, pd.DataFrame],
-    exhibit21_extractor,
+    exhibit21_extractor: PyFuncModel,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Extract a set of filings and return results."""
     failed_parsing_metadata, inference_dataset = parsed_chunk
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
deleted file mode 100644
index 6f517a3..0000000
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py
+++ /dev/null
@@ -1,40 +0,0 @@
-"""Module for formatting inputs and performing inference with a fine-tuned LayoutLM model."""
-
-import logging
-import traceback
-
-import pandas as pd
-from mlflow.pyfunc import PyFuncModel
-
-from ..entities import Ex21CompanyOwnership
-from ..utils.cloud import GCSArchive
-from .data.inference import create_inference_dataset
-
-logger = logging.getLogger(f"catalystcoop.{__name__}")
-
-
-def extract_filings(
-    filings: pd.DataFrame,
-    layoutlm: PyFuncModel,
-) -> tuple[pd.DataFrame, pd.DataFrame]:
-    """Create huggingface dataset from filings and perform extraction."""
-    try:
-        failed_metadata, dataset = create_inference_dataset(
-            filing_metadata=filings,
-            cloud_interface=GCSArchive(),
-            has_labels=False,
-        )
-        metadata, extracted = layoutlm.predict(dataset)
-        metadata = pd.concat([failed_metadata, metadata])
-    except Exception as e:
-        logger.warning(traceback.format_exc())
-        logger.warning(f"Error while extracting filings: {filings.index}")
-        metadata = pd.DataFrame(
-            {
-                "filename": filings.index,
-                "success": [False] * len(filings),
-                "notes": [str(e)] * len(filings),
-            }
-        ).set_index("filename")
-        extracted = Ex21CompanyOwnership.example(size=0)
-    return metadata, extracted

From 60a1260f8bda16847df9cc48b2d070fd342fa9cc Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Tue, 8 Oct 2024 16:22:11 -0400
Subject: [PATCH 100/161] Add notebook for training ex21 classifier

---
 src/mozilla_sec_eia/models/sec10k/__init__.py |  28 +-
 .../models/sec10k/ex_21/data/__init__.py      |  45 +++-
 .../exhibit21_layout_classifier.ipynb         | 249 +++++++++++++++++-
 .../validation_data/ex21_layout_histogram.csv | 110 ++++++++
 4 files changed, 426 insertions(+), 6 deletions(-)
 create mode 100644 src/mozilla_sec_eia/package_data/validation_data/ex21_layout_histogram.csv

diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py
index f0bb091..79e181d 100644
--- a/src/mozilla_sec_eia/models/sec10k/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/__init__.py
@@ -28,7 +28,7 @@
 
 basic_10k_assets = load_assets_from_modules([basic_10k])
 ex21_assets = load_assets_from_package_module(ex_21)
-ex21_training_data_assets = load_assets_from_modules([ex_21.data])
+ex21_data_assets = load_assets_from_modules([ex_21.data])
 shared_assets = load_assets_from_modules([extract])
 
 basic_10k_production_job = model_jobs.create_production_model_job(
@@ -65,7 +65,26 @@
 )
 ex21_training_job = define_asset_job(
     "ex21_training",
-    selection=[exhibit21_extractor] + ex21_training_data_assets,
+    selection=[exhibit21_extractor] + ex_21.data.ex21_extraction_training_assets,
+    executor_def=in_process_executor,
+)
+
+
+exhibit21_layout_classifier = define_dagstermill_asset(
+    name="exhibit21_layout_classifier",
+    notebook_path=file_relative_path(
+        __file__, "notebooks/exhibit21_layout_classifier.ipynb"
+    ),
+    config_schema=ex_21.data.Ex21TrainConfig.to_config_schema(),
+    ins={
+        "ex21_layout_labels": AssetIn(),
+        "ex21_layout_classifier_training_dataset": AssetIn(),
+    },
+    save_notebook_on_failure=True,
+)
+ex21_layout_classifier_training_job = define_asset_job(
+    "ex21_layout_classifier_training",
+    selection=[exhibit21_layout_classifier] + ex_21.data.ex21_layout_classifier_assets,
     executor_def=in_process_executor,
 )
 
@@ -74,13 +93,14 @@
     assets=basic_10k_assets
     + ex21_assets
     + shared_assets
-    + [exhibit21_extractor]
-    + ex21_training_data_assets,
+    + [exhibit21_extractor, exhibit21_layout_classifier]
+    + ex21_data_assets,
     jobs=[
         basic_10k_production_job,
         basic_10k_validation_job,
         ex21_production_job,
         ex21_training_job,
+        ex21_layout_classifier_training_job,
     ],
     resources={
         "cloud_interface": cloud_interface_resource,
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py
index 6c5e8aa..abdd4e5 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py
@@ -43,7 +43,7 @@ def ex21_training_data(config: Ex21TrainConfig):
 
 @asset(dagster_type=ex21_extract_type)
 def ex21_validation_set() -> pd.DataFrame:
-    """Return dataframe containing basic 10k validation data."""
+    """Return dataframe containing ex 21 validation data."""
     return clean_ex21_validation_set(
         validation_helpers.load_validation_data("ex21_labels.csv")
     )
@@ -78,3 +78,46 @@ def ex21_inference_dataset(
         filing_metadata=ex21_validation_filing_metadata,
         cloud_interface=cloud_interface,
     )
+
+
+@asset
+def ex21_layout_labels() -> pd.DataFrame:
+    """Return dataframe with labels describing layout of validation filings."""
+    return validation_helpers.load_validation_data("ex21_layout_histogram.csv")
+
+
+@asset
+def ex21_layout_classifier_filing_metadata(
+    cloud_interface: GCSArchive,
+    ex21_layout_labels: pd.DataFrame,
+) -> pd.DataFrame:
+    """Get sec 10k filing metadata for filings with layout labels."""
+    filing_metadata = cloud_interface.get_metadata()
+    return filing_metadata[filing_metadata.index.isin(ex21_layout_labels["filename"])]
+
+
+@asset
+def ex21_layout_classifier_training_dataset(
+    cloud_interface: GCSArchive,
+    ex21_layout_classifier_filing_metadata: pd.DataFrame,
+) -> pd.DataFrame:
+    """Construct dataset of ex 21 filings for training the layout classifier."""
+    _, dataset = create_inference_dataset(
+        filing_metadata=ex21_layout_classifier_filing_metadata,
+        cloud_interface=cloud_interface,
+    )
+    return dataset
+
+
+ex21_extraction_training_assets = [
+    ex21_training_data,
+    ex21_validation_set,
+    ex21_validation_filing_metadata,
+    ex21_inference_dataset,
+]
+
+ex21_layout_classifier_assets = [
+    ex21_layout_labels,
+    ex21_layout_classifier_filing_metadata,
+    ex21_layout_classifier_training_dataset,
+]
diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb
index 1781454..584832c 100644
--- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb
+++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb
@@ -1,9 +1,256 @@
 {
  "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "618936ab-bda1-4b46-8ee5-dbdfc0090562",
+   "metadata": {},
+   "source": [
+    "## Exhibit 21 layout classifier\n",
+    "Some EX21 filings are formatted as a paragraph of text rather than a structured table. Given that the extraction model is trained/designed to work with a table layout, it tends to perform poorly on these filings. In this notebook we will develop a classifier model to detect these filings, so we can filter them out, and potentially develop a dedicated model to handle them."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a22bfc9d-9487-43ec-b0b7-d5bb6e17f994",
+   "metadata": {},
+   "source": [
+    "### Load labeled layouts from upstream asset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "b4963648-2aac-46a7-9778-8808c1e5eeb2",
+   "metadata": {
+    "tags": [
+     "parameters"
+    ]
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n",
+      "2024-10-08 13:55:05 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_layout_labels using PickledObjectFilesystemIOManager...\n",
+      "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n",
+      "2024-10-08 13:55:05 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_layout_classifier_training_dataset using PickledObjectFilesystemIOManager...\n"
+     ]
+    }
+   ],
+   "source": [
+    "from mozilla_sec_eia.models.sec10k import defs\n",
+    "\n",
+    "ex21_layout_labels = defs.load_asset_value(\"ex21_layout_labels\")\n",
+    "ex21_layout_classifier_training_dataset = defs.load_asset_value(\"ex21_layout_classifier_training_dataset\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3e72a132-a87b-4827-aef0-0898e72317ca",
+   "metadata": {},
+   "source": [
+    "### Implement method to construct feature dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "id": "ee4ed368-7d01-4cb8-952f-f7941900d669",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "from mozilla_sec_eia.models.sec10k.ex_21.data.common import BBOX_COLS_PDF\n",
+    "\n",
+    "\n",
+    "def calculate_features(record):\n",
+    "    \"\"\"Compute features from bounding boxes in inference dataset.\"\"\"\n",
+    "    df = pd.DataFrame(record[\"bboxes\"], columns=BBOX_COLS_PDF)\n",
+    "    features = {}\n",
+    "    features[\"n_bboxes\"] = len(df)\n",
+    "\n",
+    "    # block density wasn't a very useful feature, maybe rework?\n",
+    "    # Calculate the bounding box density of the area of the page with text\n",
+    "    # x_width = df[\"bottom_right_x_pdf\"].max() - df[\"top_left_x_pdf\"].min()\n",
+    "    # y_height = df[\"bottom_right_y_pdf\"].max() - df[\"top_left_y_pdf\"].min()\n",
+    "    # text_area = x_width * y_height\n",
+    "    # features[\"block_density\"] = features[\"n_bboxes\"] / text_area\n",
+    "\n",
+    "    # Calculate average y-distance between bounding boxes for a given document\n",
+    "    df = df.sort_values(by=[\"top_left_y_pdf\", \"top_left_x_pdf\"])\n",
+    "    y_diffs = df[\"top_left_y_pdf\"].diff().dropna()\n",
+    "    features[\"avg_y_distance\"] = y_diffs.mean()\n",
+    "    features[\"std_y_distance\"] = y_diffs.std()\n",
+    "\n",
+    "    # Calculate x-distance to assess horizontal alignment\n",
+    "    x_diffs = df.groupby(\"top_left_y_pdf\")[\"top_left_x_pdf\"].apply(lambda x: x.diff().dropna())\n",
+    "    features[\"avg_x_distance\"] = x_diffs.mean()\n",
+    "    features[\"std_x_distance\"] = x_diffs.std()\n",
+    "\n",
+    "    # Define a small threshold to group bounding boxes that are on the same line\n",
+    "    y_threshold = 0.1\n",
+    "    df.loc[:, \"line_group\"] = (df[\"top_left_y_pdf\"].diff().fillna(0).abs() > y_threshold).cumsum()\n",
+    "    boxes_per_line = df.groupby(\"line_group\").size()\n",
+    "    features[\"median_boxes_per_line\"] = boxes_per_line.median()\n",
+    "    return pd.Series(features)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "44f87fd0-82ad-4564-8476-c0ddd78e1527",
+   "metadata": {},
+   "source": [
+    "### Create training/test sets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "id": "f71e2dfc-552d-49e7-b23d-267c2158efe2",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "X = ex21_layout_classifier_training_dataset.sort_values(by=[\"id\"]).apply(calculate_features, axis=1)\n",
+    "y = np.where(ex21_layout_labels.sort_values(by=[\"filename\"])[\"layout\"] == \"Paragraph\", 1, 0)\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=16)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "de130cf4-cd52-4dde-8582-145566a0b1f3",
+   "metadata": {},
+   "source": [
+    "### Create mlflow model to wrap classifier"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "id": "08bf5f11-af80-4c65-a005-2a2de49c30b5",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import mlflow\n",
+    "\n",
+    "\n",
+    "class Ex21LayoutClassifier(mlflow.pyfunc.PythonModel):\n",
+    "    \"\"\"Wrap sklearn classifier in mlflow pyfunc model.\"\"\"\n",
+    "\n",
+    "    def load_context(self, context):\n",
+    "        \"\"\"Load sklearn model.\"\"\"\n",
+    "        self.model = mlflow.sklearn.load_model(context.artifacts[\"layout_classifier\"])\n",
+    "\n",
+    "    def predict(self, context, model_input: pd.DataFrame):\n",
+    "        \"\"\"Create feature matrix from inference dataset and use trained model for prediction.\"\"\"\n",
+    "        features_df = model_input.apply(calculate_features, axis=1)\n",
+    "        return self.model.predict(features_df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c3a2c6a1-cdc1-4fd0-a1ca-a5d5cc34d139",
+   "metadata": {},
+   "source": [
+    "### Train and log model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "id": "55d2194e-82a8-4d1e-8318-a8c893dc29de",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/10/08 16:10:39 WARNING mlflow.utils.autologging_utils: MLflow sklearn autologging is known to be compatible with 0.24.1 <= scikit-learn <= 1.5.1, but the installed version is 1.5.2. If you encounter errors during autologging, try upgrading / downgrading scikit-learn to a compatible version, or try upgrading MLflow.\n",
+      "2024/10/08 16:10:39 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.\n",
+      "2024/10/08 16:10:40 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
+      "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
+      "\n",
+      "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
+      "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
+      "Please also refer to the documentation for alternative solver options:\n",
+      "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
+      "  n_iter_i = _check_optimize_result(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/mlflow/types/utils.py:407: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "dd07e390d13f4f6692ae96288ffb1dbb",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/10/08 16:11:30 INFO mlflow.tracking._tracking_service.client: 🏃 View run languid-shrimp-450 at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/15/runs/08802dbf347c4cd5b66751c11328a06f.\n",
+      "2024/10/08 16:11:30 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/15.\n",
+      "2024/10/08 16:11:30 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...\n",
+      "2024/10/08 16:11:31 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!\n"
+     ]
+    }
+   ],
+   "source": [
+    "from dotenv import load_dotenv\n",
+    "from mlflow.models import infer_signature\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "\n",
+    "from mozilla_sec_eia.library.mlflow import configure_mlflow\n",
+    "\n",
+    "load_dotenv()\n",
+    "\n",
+    "\n",
+    "configure_mlflow()\n",
+    "mlflow.set_experiment(\"exhibit21_layout_classifier\")\n",
+    "\n",
+    "# Autolog sklearn model\n",
+    "mlflow.autolog()\n",
+    "\n",
+    "model = LogisticRegression()\n",
+    "pyfunc_model = Ex21LayoutClassifier()\n",
+    "with mlflow.start_run():\n",
+    "    model.fit(X_train, y_train)\n",
+    "    model.score(X_test, y_test)\n",
+    "    sklearn_model_uri = mlflow.get_artifact_uri(\"model\")\n",
+    "    mlflow.pyfunc.log_model(\n",
+    "        artifact_path=\"exhibit21_layout_classifier\",\n",
+    "        python_model=pyfunc_model,\n",
+    "        artifacts={\"layout_classifier\": sklearn_model_uri},\n",
+    "        signature=infer_signature(ex21_layout_classifier_training_dataset, y),\n",
+    "    )"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "80cda90e-c2cb-4b71-b10d-cb23d7b51b3f",
+   "id": "ed43ef26-0884-4e09-9709-8d7567f87f73",
    "metadata": {},
    "outputs": [],
    "source": []
diff --git a/src/mozilla_sec_eia/package_data/validation_data/ex21_layout_histogram.csv b/src/mozilla_sec_eia/package_data/validation_data/ex21_layout_histogram.csv
new file mode 100644
index 0000000..e18c5a6
--- /dev/null
+++ b/src/mozilla_sec_eia/package_data/validation_data/ex21_layout_histogram.csv
@@ -0,0 +1,110 @@
+,filename,layout,labeler_initials
+0,edgar/data/38079/0001558370-16-004332.txt,Subsidiary List,KL
+1,edgar/data/59527/0000059527-20-000007.txt,Blue & White Table (3 Column),KL
+2,edgar/data/61339/0001161728-17-000004.txt,List with Sentences,KL
+3,edgar/data/107815/0000107815-17-000106.txt,Generic Table,KL
+4,edgar/data/1317577/0001193125-13-356794.txt,Subsidiary List,KL
+5,edgar/data/40545/0000040545-04-000013.txt,Generic Table,KL
+6,edgar/data/39547/0001047469-03-024149.txt,List with Indented Nested Subsidiaries,KL
+7,edgar/data/49728/0001144204-11-070058.txt,Generic Table,KL
+8,edgar/data/200155/0000021267-99-000027.txt,Paragraph,KL
+9,edgar/data/315858/0000315858-19-000023.txt,Generic Table,KL
+10,edgar/data/718877/0001047469-08-007085.txt,Generic Table,KL
+11,edgar/data/811669/0000950123-05-002610.txt,List with Indented Nested Subsidiaries,KL
+12,edgar/data/857501/0001065949-17-000087.txt,Generic Table,KL
+13,edgar/data/874501/0000874501-15-000013.txt,Generic Table,KL
+14,edgar/data/908255/0000908255-13-000006.txt,Subsidiary List,KL
+15,edgar/data/913614/0000930661-01-502777.txt,Paragraph,KL
+16,edgar/data/923472/0000892569-97-000821.txt,Paragraph,KL
+17,edgar/data/1484565/0001564590-20-008705.txt,Generic Table,KL
+18,edgar/data/1582244/0001582244-16-000187.txt,Generic Table,KL
+19,edgar/data/1592386/0001592386-20-000003.txt,Blue & White Table (2 Column),KL
+20,edgar/data/1739566/0001739566-21-000088.txt,Blue & White Table (2 Column),KL
+21,edgar/data/1394159/0001394159-15-000045.txt,Generic Table,KL
+22,edgar/data/3146/0001193125-06-055140.txt,Subsidiary List,KL
+23,edgar/data/354707/0000354707-19-000043.txt,List with Indented Nested Subsidiaries,KL
+24,edgar/data/84557/0001046861-06-000007.txt,Generic Table,KL
+25,edgar/data/100826/0001193125-09-042636.txt,List with Indented Nested Subsidiaries,KL
+26,edgar/data/81033/0000950117-06-000927.txt,Paragraph,KL
+27,edgar/data/4904/0000004904-09-000040.txt,List with Indented Nested Subsidiaries,KL
+28,edgar/data/46207/0001104659-13-011461.txt,List with Indented Nested Subsidiaries,KL
+29,edgar/data/205402/0000950114-99-000043.txt,Paragraph,KL
+30,edgar/data/9342/0000009342-95-000008.txt,Paragraph,KL
+31,edgar/data/9534/0000897069-05-000574.txt,Generic Table,KL
+32,edgar/data/18647/0001169232-08-000603.txt,Table with 2 Subsidiary Name Columns,KL
+33,edgar/data/20947/0001031296-06-000044.txt,Subsidiary List,KL
+34,edgar/data/34067/0001104659-06-016592.txt,Blue & White Table (2 Column),KL
+35,edgar/data/38725/0000038725-17-000042.txt,Blue & White Table (3 Column),KL
+36,edgar/data/60549/0001047469-98-012481.txt,Paragraph,KL
+37,edgar/data/61986/0000061986-99-000003.txt,Paragraph,KL
+38,edgar/data/71675/0001046861-02-000012.txt,Generic Table,KL
+39,edgar/data/77227/0001031296-09-000008.txt,Subsidiary List,KL
+40,edgar/data/78778/0000078778-97-000019.txt,Paragraph,KL
+41,edgar/data/78890/0000078890-14-000004.txt,List with Indented Nested Subsidiaries,KL
+42,edgar/data/80812/0000927016-98-004349.txt,Paragraph,KL
+43,edgar/data/86521/0000086521-10-000019.txt,Generic Table,KL
+44,edgar/data/92487/0000004904-21-000010.txt,List with Indented Nested Subsidiaries,KL
+45,edgar/data/96271/0001193125-07-042781.txt,Subsidiary List,KL
+46,edgar/data/99250/0000099250-00-000002.txt,Paragraph,KL
+47,edgar/data/100122/0000941138-03-000007.txt,Paragraph,KL
+48,edgar/data/103872/0001193125-13-444053.txt,Generic Table,KL
+49,edgar/data/320575/0001193125-07-117419.txt,List with Sentences,KL
+50,edgar/data/3499/0000003499-08-000003.txt,Subsidiary List,KL
+51,edgar/data/3570/0000003570-17-000052.txt,Blue & White Table (2 Column),KL
+52,edgar/data/4127/0000004127-17-000033.txt,Generic Table,KL
+53,edgar/data/4962/0001193125-10-041232.txt,List with Indented Nested Subsidiaries,KL
+54,edgar/data/5981/0001193125-12-106666.txt,Generic Table,KL
+55,edgar/data/11199/0001104659-06-016718.txt,Blue & White Table (3 Column),KL
+56,edgar/data/29644/0001628280-16-019746.txt,Generic Table,KL
+57,edgar/data/38723/0000038723-09-000029.txt,List with Sentences,KL
+58,edgar/data/320340/0000950123-10-027168.txt,Blue & White Table (2 Column),KL
+59,edgar/data/716646/0000950135-06-004150.txt,Generic Table,KL
+60,edgar/data/719402/0001193125-14-113892.txt,List with Indented Nested Subsidiaries,KL
+61,edgar/data/749660/0001193125-12-104800.txt,Generic Table,KL
+62,edgar/data/56679/0001193125-16-634657.txt,Generic Table,KL
+63,edgar/data/75829/0001206774-11-002167.txt,Generic Table,KL
+64,edgar/data/89800/0000089800-18-000004.txt,Generic Table,KL
+65,edgar/data/799233/0000799233-13-000013.txt,Generic Table,KL
+66,edgar/data/804328/0001234452-15-000271.txt,Generic Table,KL
+67,edgar/data/821127/0000821127-11-000003.txt,List with Sentences,KL
+68,edgar/data/869495/0001213900-18-002720.txt,Generic Table,KL
+69,edgar/data/860546/0001104659-07-015618.txt,Subsidiary List,KL
+70,edgar/data/875622/0001140361-17-012337.txt,List with Sentences,KL
+71,edgar/data/891014/0000891014-11-000007.txt,Generic Table,KL
+72,edgar/data/2024/0000002024-95-000007.txt,Paragraph,
+73,edgar/data/790070/0001047469-04-006200.txt,Paragraph,
+74,edgar/data/791905/0000791905-98-000012.txt,Paragraph,
+75,edgar/data/804104/0000950128-98-000667.txt,Paragraph,
+76,edgar/data/885708/0000950137-98-001313.txt,Paragraph,
+77,edgar/data/887921/0000950136-01-000603.txt,Paragraph,
+78,edgar/data/888711/0000927016-97-000977.txt,Paragraph,
+79,edgar/data/906247/0000950123-98-006267.txt,Paragraph,
+80,edgar/data/908837/0001012364-99-000027.txt,Paragraph,
+81,edgar/data/944400/0000950134-03-015913.txt,Paragraph,
+82,edgar/data/944480/0000944480-04-000013.txt,Paragraph,
+83,edgar/data/945093/0000936392-97-000426.txt,Paragraph,
+84,edgar/data/1000787/0001032210-02-000502.txt,Paragraph,
+85,edgar/data/1013050/0000912057-97-022333.txt,Paragraph,
+86,edgar/data/1014739/0001089355-01-000290.txt,Paragraph,
+87,edgar/data/1021285/0000941157-99-000017.txt,Paragraph,
+88,edgar/data/1029102/0000950170-00-000513.txt,Paragraph,
+89,edgar/data/1068717/0000950152-05-003152.txt,Paragraph,
+90,edgar/data/1091973/0000893220-03-000418.txt,Paragraph,
+91,edgar/data/1093672/0001654954-23-003112.txt,Paragraph,
+92,edgar/data/1096343/0000916641-01-500015.txt,Paragraph,
+93,edgar/data/1257296/0000950124-04-001631.txt,Paragraph,
+94,edgar/data/6314/0000891092-07-004187.txt,Paragraph,
+95,edgar/data/9466/0001193125-14-051838.txt,Generic Table,
+96,edgar/data/20290/0001326160-19-000057.txt,Subsidiary List,
+97,edgar/data/32604/0000032604-97-000015.txt,Paragraph,
+98,edgar/data/32689/0001047469-09-001643.txt,Generic Table,
+99,edgar/data/35214/0000950152-04-001894.txt,Paragraph,
+100,edgar/data/43350/0001144204-17-014878.txt,List with Sentences,
+101,edgar/data/46738/0000950131-01-001406.txt,Generic Table,
+102,edgar/data/54507/0000054507-18-000012.txt,Generic Table,
+103,edgar/data/57183/0001068800-04-000659.txt,Paragraph,
+104,edgar/data/68589/0000068589-11-000002.txt,List with Sentences,
+105,edgar/data/70145/0001193125-11-321222.txt,Generic Table,
+106,edgar/data/99780/0000099780-20-000025.txt,Blue & White Table (3 Column),
+107,edgar/data/103682/0001193125-05-038710.txt,Table with 2 Subsidiary Name Columns,
+108,edgar/data/104819/0001193125-14-422013.txt,Generic Table,

From 4105110d18db7c49f1961431be6103a12d7d6114 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Tue, 8 Oct 2024 17:53:53 -0400
Subject: [PATCH 101/161] Pull in model updates

---
 src/mozilla_sec_eia/models/sec10k/__init__.py |   4 +
 src/mozilla_sec_eia/models/sec10k/entities.py |  11 ++
 .../models/sec10k/ex_21/__init__.py           |  41 ++++++-
 .../notebooks/exhibit21_extractor.ipynb       | 104 +++++++++++-------
 .../validation_data/ex21_labels.csv           |  48 +++++---
 5 files changed, 152 insertions(+), 56 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py
index 79e181d..2ecf3c2 100644
--- a/src/mozilla_sec_eia/models/sec10k/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/__init__.py
@@ -109,6 +109,10 @@
             mlflow_interface=mlflow_interface_resource,
             uri="runs:/d603f8e219da4fd39f3c2f8d7d3bcb40/exhibit21_extractor",
         ),
+        "ex21_classifier_io_manager": MlflowPyfuncModelIOManager(
+            mlflow_interface=mlflow_interface_resource,
+            uri="runs:/08802dbf347c4cd5b66751c11328a06f/exhibit21_layout_classifier",
+        ),
         "pandas_parquet_io_manager": PandasParquetIOManager(
             base_path=UPath("gs://sec10k-outputs/v2")
         ),
diff --git a/src/mozilla_sec_eia/models/sec10k/entities.py b/src/mozilla_sec_eia/models/sec10k/entities.py
index b0f6869..2ee5b23 100644
--- a/src/mozilla_sec_eia/models/sec10k/entities.py
+++ b/src/mozilla_sec_eia/models/sec10k/entities.py
@@ -53,6 +53,17 @@ class Sec10kExtractionMetadata(pa.DataFrameModel):
     )
 
 
+class Ex21Layout(pa.DataFrameModel):
+    """Schema for ex21 layout classification results, indexed by filename."""
+
+    filename: Index[str] = pa.Field(description="Name of extracted filing.")
+    paragraph: Series[bool] = pa.Field(
+        description="Indicates whether ex21 is formatted as a paragraph or not.",
+        coerce=True,  # cast the column to bool during validation
+    )
+
+
 ex21_extract_type = pandera_schema_to_dagster_type(Ex21CompanyOwnership)
 basic_10k_extract_type = pandera_schema_to_dagster_type(Basic10kCompanyInfo)
 sec10k_extract_metadata_type = pandera_schema_to_dagster_type(Sec10kExtractionMetadata)
+ex21_layout_type = pandera_schema_to_dagster_type(Ex21Layout)
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
index 34f6c3a..9a70b74 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
@@ -15,8 +15,10 @@
 
 from ..entities import (
     Ex21CompanyOwnership,
+    Ex21Layout,
     Sec10kExtractionMetadata,
     ex21_extract_type,
+    ex21_layout_type,
     sec10k_extract_metadata_type,
 )
 from ..extract import chunk_filings, sec10k_filing_metadata, year_quarter_partitions
@@ -60,6 +62,28 @@ def extract_filing_chunk(
     return metadata, extracted
 
 
+@op(
+    out={"layout": Out(dagster_type=ex21_layout_type)},
+    ins={
+        "exhibit21_layout_classifier": In(
+            input_manager_key="ex21_classifier_io_manager"
+        )
+    },
+)
+def classify_chunk_layouts(
+    parsed_chunk: tuple[pd.DataFrame, pd.DataFrame],
+    exhibit21_layout_classifier: PyFuncModel,
+) -> pd.DataFrame:
+    """Run the layout classifier on a parsed chunk; return predictions by filename."""
+    _, inference_dataset = parsed_chunk  # only the second frame is used here
+    return pd.DataFrame(
+        {
+            "filename": inference_dataset["id"],  # "id" column becomes the index
+            "paragraph": exhibit21_layout_classifier.predict(inference_dataset),
+        }
+    ).set_index("filename")
+
+
 @op(
     out={
         "metadata": Out(
@@ -70,20 +94,27 @@ def extract_filing_chunk(
             io_manager_key="pandas_parquet_io_manager",
             dagster_type=ex21_extract_type,
         ),
+        "layout": Out(
+            io_manager_key="pandas_parquet_io_manager",
+            dagster_type=ex21_layout_type,
+        ),
     }
 )
 def collect_extracted_chunks(
     metadata_dfs: list[pd.DataFrame],
     extracted_dfs: list[pd.DataFrame],
-) -> tuple[pd.DataFrame, pd.DataFrame]:
+    layout_dfs: list[pd.DataFrame],
+) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
     """Collect chunks of extracted filings."""
     metadata_dfs = [df for df in metadata_dfs if not df.empty]
     extracted_dfs = [df for df in extracted_dfs if not df.empty]
     metadata_df = pd.concat(metadata_dfs)
     extracted_df = pd.concat(extracted_dfs)
+    layout_df = pd.concat(layout_dfs)
     return (
         Sec10kExtractionMetadata.validate(metadata_df),
         Ex21CompanyOwnership.validate(extracted_df),
+        Ex21Layout.validate(layout_df),
     )
 
 
@@ -106,6 +137,7 @@ def create_dataset(
         "ex21_company_ownership_info": AssetOut(
             io_manager_key="pandas_parquet_io_manager"
         ),
+        "ex21_layout": AssetOut(io_manager_key="pandas_parquet_io_manager"),
     },
     partitions_def=year_quarter_partitions,
 )
@@ -115,12 +147,11 @@ def ex21_extract(
     """Extract ownership info from exhibit 21 docs."""
     filing_chunks = chunk_filings(sec10k_filing_metadata)
     parsed_chunks = filing_chunks.map(create_dataset)
+    layout_chunks = parsed_chunks.map(classify_chunk_layouts)
     metadata_chunks, extracted_chunks = parsed_chunks.map(extract_filing_chunk)
-    metadata, extracted = collect_extracted_chunks(
-        metadata_chunks.collect(), extracted_chunks.collect()
+    return collect_extracted_chunks(
+        metadata_chunks.collect(), extracted_chunks.collect(), layout_chunks.collect()
     )
 
-    return metadata, extracted
-
 
 production_assets = [sec10k_filing_metadata, ex21_extract]
diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
index 7fc14b5..e155387 100644
--- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
+++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
@@ -38,46 +38,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "id": "48f185de-95ef-4194-9245-93f8d603d2e6",
    "metadata": {
     "tags": [
      "parameters"
     ]
    },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n",
-      "2024-10-06 15:23:43 -0400 - dagster - DEBUG - system - Loading 11 partitions...\n",
-      "2024-10-06 15:23:43 -0400 - dagster - DEBUG - system - Loading partition l from /home/zach/catalyst/workspace/storage/ex21_training_data/l using PickledObjectFilesystemIOManager...\n"
-     ]
-    },
-    {
-     "ename": "NotADirectoryError",
-     "evalue": "[Errno 20] Not a directory: '/home/zach/catalyst/workspace/storage/ex21_training_data/l'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mNotADirectoryError\u001b[0m                        Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[1], line 9\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mmozilla_sec_eia\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01msec10k\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m defs\n\u001b[1;32m      5\u001b[0m context \u001b[38;5;241m=\u001b[39m dagstermill\u001b[38;5;241m.\u001b[39mget_context(op_config\u001b[38;5;241m=\u001b[39m{\n\u001b[1;32m      6\u001b[0m     \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlayoutlm_training_run\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlayoutlm-labeledv0.2\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m      7\u001b[0m })\n\u001b[0;32m----> 9\u001b[0m ex21_training_data \u001b[38;5;241m=\u001b[39m \u001b[43mdefs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_asset_value\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mex21_training_data\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpartition_key\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mlabeledv0.2\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m     11\u001b[0m ex21_failed_parsing_metadata \u001b[38;5;241m=\u001b[39m defs\u001b[38;5;241m.\u001b[39mload_asset_value(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mex21_failed_parsing_metadata\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m     12\u001b[0m ex21_inference_dataset \u001b[38;5;241m=\u001b[39m defs\u001b[38;5;241m.\u001b[39mload_asset_value(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mex21_inference_dataset\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/definitions/definitions_class.py:519\u001b[0m, in \u001b[0;36mDefinitions.load_asset_value\u001b[0;34m(self, asset_key, python_type, instance, partition_key, metadata)\u001b[0m\n\u001b[1;32m    490\u001b[0m \u001b[38;5;129m@public\u001b[39m\n\u001b[1;32m    491\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload_asset_value\u001b[39m(\n\u001b[1;32m    492\u001b[0m     \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    498\u001b[0m     metadata: Optional[Dict[\u001b[38;5;28mstr\u001b[39m, Any]] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m    499\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mobject\u001b[39m:\n\u001b[1;32m    500\u001b[0m \u001b[38;5;250m    \u001b[39m\u001b[38;5;124;03m\"\"\"Load the contents of an asset as a Python object.\u001b[39;00m\n\u001b[1;32m    501\u001b[0m \n\u001b[1;32m    502\u001b[0m \u001b[38;5;124;03m    Invokes `load_input` on the :py:class:`IOManager` associated with the asset.\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    517\u001b[0m \u001b[38;5;124;03m        The contents of an asset as a Python object.\u001b[39;00m\n\u001b[1;32m    518\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[0;32m--> 519\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_repository_def\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_asset_value\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    520\u001b[0m \u001b[43m        \u001b[49m\u001b[43masset_key\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43masset_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    521\u001b[0m \u001b[43m        
\u001b[49m\u001b[43mpython_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpython_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    522\u001b[0m \u001b[43m        \u001b[49m\u001b[43minstance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minstance\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    523\u001b[0m \u001b[43m        \u001b[49m\u001b[43mpartition_key\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpartition_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    524\u001b[0m \u001b[43m        \u001b[49m\u001b[43mmetadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmetadata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    525\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/definitions/repository_definition/repository_definition.py:350\u001b[0m, in \u001b[0;36mRepositoryDefinition.load_asset_value\u001b[0;34m(self, asset_key, python_type, instance, partition_key, metadata, resource_config)\u001b[0m\n\u001b[1;32m    346\u001b[0m normalized_assets_defs_by_key \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m    347\u001b[0m     k: ad \u001b[38;5;28;01mfor\u001b[39;00m ad \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39masset_graph\u001b[38;5;241m.\u001b[39massets_defs \u001b[38;5;28;01mfor\u001b[39;00m k \u001b[38;5;129;01min\u001b[39;00m ad\u001b[38;5;241m.\u001b[39mkeys\n\u001b[1;32m    348\u001b[0m }\n\u001b[1;32m    349\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m AssetValueLoader(normalized_assets_defs_by_key, instance\u001b[38;5;241m=\u001b[39minstance) \u001b[38;5;28;01mas\u001b[39;00m loader:\n\u001b[0;32m--> 350\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mloader\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_asset_value\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    351\u001b[0m \u001b[43m        \u001b[49m\u001b[43masset_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    352\u001b[0m \u001b[43m        \u001b[49m\u001b[43mpython_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpython_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    353\u001b[0m \u001b[43m        \u001b[49m\u001b[43mpartition_key\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpartition_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    354\u001b[0m \u001b[43m        \u001b[49m\u001b[43mmetadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmetadata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    355\u001b[0m \u001b[43m        \u001b[49m\u001b[43mresource_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresource_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    
356\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/decorator_utils.py:203\u001b[0m, in \u001b[0;36m_wrap_with_pre_call_fn.<locals>.wrapped_with_pre_call_fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    201\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m condition \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mor\u001b[39;00m condition(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m    202\u001b[0m     pre_call_fn()\n\u001b[0;32m--> 203\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/storage/asset_value_loader.py:169\u001b[0m, in \u001b[0;36mAssetValueLoader.load_asset_value\u001b[0;34m(self, asset_key, python_type, partition_key, input_definition_metadata, resource_config, metadata)\u001b[0m\n\u001b[1;32m    139\u001b[0m io_manager_config \u001b[38;5;241m=\u001b[39m get_mapped_resource_config(\n\u001b[1;32m    140\u001b[0m     {io_manager_key: io_manager_def}, io_resource_config\n\u001b[1;32m    141\u001b[0m )\n\u001b[1;32m    143\u001b[0m input_context \u001b[38;5;241m=\u001b[39m build_input_context(\n\u001b[1;32m    144\u001b[0m     name\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m    145\u001b[0m     asset_key\u001b[38;5;241m=\u001b[39masset_key,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    166\u001b[0m     ),\n\u001b[1;32m    167\u001b[0m )\n\u001b[0;32m--> 169\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mio_manager\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_input\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_context\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/storage/upath_io_manager.py:412\u001b[0m, in \u001b[0;36mUPathIOManager.load_input\u001b[0;34m(self, context)\u001b[0m\n\u001b[1;32m    410\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m    411\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 412\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_load_partitions\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcontext\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/storage/upath_io_manager.py:396\u001b[0m, in \u001b[0;36mUPathIOManager._load_partitions\u001b[0;34m(self, context)\u001b[0m\n\u001b[1;32m    393\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_load_partitions\u001b[39m(\u001b[38;5;28mself\u001b[39m, context: InputContext) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Dict[\u001b[38;5;28mstr\u001b[39m, Any]:\n\u001b[1;32m    394\u001b[0m     \u001b[38;5;66;03m# load multiple partitions\u001b[39;00m\n\u001b[1;32m    395\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m inspect\u001b[38;5;241m.\u001b[39miscoroutinefunction(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mload_from_path):\n\u001b[0;32m--> 396\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_partitions\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcontext\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    397\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    398\u001b[0m         \u001b[38;5;66;03m# load_from_path returns a coroutine, so we need to await the results\u001b[39;00m\n\u001b[1;32m    399\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mload_partitions_async(context)\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/storage/upath_io_manager.py:81\u001b[0m, in \u001b[0;36mUPathIOManager.load_partitions\u001b[0;34m(self, context)\u001b[0m\n\u001b[1;32m     78\u001b[0m objs \u001b[38;5;241m=\u001b[39m {}\n\u001b[1;32m     80\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m partition_key \u001b[38;5;129;01min\u001b[39;00m context\u001b[38;5;241m.\u001b[39masset_partition_keys:\n\u001b[0;32m---> 81\u001b[0m     obj \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_load_partition_from_path\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m     82\u001b[0m \u001b[43m        \u001b[49m\u001b[43mcontext\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     83\u001b[0m \u001b[43m        \u001b[49m\u001b[43mpartition_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     84\u001b[0m \u001b[43m        \u001b[49m\u001b[43mpaths\u001b[49m\u001b[43m[\u001b[49m\u001b[43mpartition_key\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     85\u001b[0m \u001b[43m        \u001b[49m\u001b[43mbackcompat_paths\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpartition_key\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     86\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     87\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m obj \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:  \u001b[38;5;66;03m# in case some partitions were skipped\u001b[39;00m\n\u001b[1;32m     88\u001b[0m         objs[partition_key] \u001b[38;5;241m=\u001b[39m obj\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/storage/upath_io_manager.py:307\u001b[0m, in \u001b[0;36mUPathIOManager._load_partition_from_path\u001b[0;34m(self, context, partition_key, path, backcompat_path)\u001b[0m\n\u001b[1;32m    305\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m    306\u001b[0m     context\u001b[38;5;241m.\u001b[39mlog\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_loading_input_partition_log_message(path, partition_key))\n\u001b[0;32m--> 307\u001b[0m     obj \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_from_path\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcontext\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcontext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    308\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m obj\n\u001b[1;32m    309\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/storage/fs_io_manager.py:283\u001b[0m, in \u001b[0;36mPickledObjectFilesystemIOManager.load_from_path\u001b[0;34m(self, context, path)\u001b[0m\n\u001b[1;32m    282\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload_from_path\u001b[39m(\u001b[38;5;28mself\u001b[39m, context: InputContext, path: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUPath\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Any:\n\u001b[0;32m--> 283\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43mpath\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopen\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrb\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m file:\n\u001b[1;32m    284\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m pickle\u001b[38;5;241m.\u001b[39mload(file)\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/upath/implementations/local.py:134\u001b[0m, in \u001b[0;36mPosixUPath.open\u001b[0;34m(self, mode, buffering, encoding, errors, newline, **fsspec_kwargs)\u001b[0m\n\u001b[1;32m    125\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m(LocalPath, \u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39mopen(\n\u001b[1;32m    126\u001b[0m         mode\u001b[38;5;241m=\u001b[39mmode,\n\u001b[1;32m    127\u001b[0m         buffering\u001b[38;5;241m=\u001b[39mbuffering,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    131\u001b[0m         \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mfsspec_kwargs,\n\u001b[1;32m    132\u001b[0m     )\n\u001b[1;32m    133\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 134\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mPosixPath\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopen\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuffering\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnewline\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/pathlib.py:1044\u001b[0m, in \u001b[0;36mPath.open\u001b[0;34m(self, mode, buffering, encoding, errors, newline)\u001b[0m\n\u001b[1;32m   1042\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m   1043\u001b[0m     encoding \u001b[38;5;241m=\u001b[39m io\u001b[38;5;241m.\u001b[39mtext_encoding(encoding)\n\u001b[0;32m-> 1044\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m io\u001b[38;5;241m.\u001b[39mopen(\u001b[38;5;28mself\u001b[39m, mode, buffering, encoding, errors, newline)\n",
-      "\u001b[0;31mNotADirectoryError\u001b[0m: [Errno 20] Not a directory: '/home/zach/catalyst/workspace/storage/ex21_training_data/l'"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import dagstermill\n",
     "\n",
@@ -362,6 +330,50 @@
     ")\n",
     "\n",
     "\n",
+    "def separate_entities_by_row(df):\n",
+    "    \"\"\"Separate entities that span multiple rows and should be distinct.\n",
+    "\n",
+    "    Sometimes LayoutLM groups multiple entities that span multiple rows\n",
+    "    into one entity. This function makes an attempt to break these out\n",
+    "    into multiple entities, by taking the average distance between rows\n",
+    "    and separating a grouped entity if the distance between y values\n",
+    "    is greater than the third quantile of y value spacing.\n",
+    "    \"\"\"\n",
+    "    threshold = 1.0\n",
+    "    for entity in [\"subsidiary\", \"loc\", \"own_per\"]:\n",
+    "        entity_df = df[df[\"pred\"] == entity]\n",
+    "        entity_df[\"line_group\"] = entity_df[\"top_left_y\"].transform(\n",
+    "            lambda y: (y // threshold).astype(int)\n",
+    "        )\n",
+    "        # Get the unique y-values for each line (group) per file\n",
+    "        line_positions = (\n",
+    "            entity_df.groupby([\"line_group\"])[\"top_left_y\"].mean().reset_index()\n",
+    "        )\n",
+    "        # Calculate the difference between adjacent y-values (i.e., distance between lines)\n",
+    "        line_positions[\"y_diff\"] = line_positions[\"top_left_y\"].diff()\n",
+    "        # Filter out NaN values and take the mean of the valid distances\n",
+    "        y_diffs = line_positions[\"y_diff\"].dropna()\n",
+    "        avg_y_diff = y_diffs.apply(np.floor).mean()\n",
+    "        # if an I labeled entity is more than avg_y_diff from it's previoius box then make it a B entity\n",
+    "        entity_df[\"prev_y\"] = entity_df[\"top_left_y\"].shift(1)\n",
+    "        entity_df[\"prev_iob\"] = entity_df[\"iob_pred\"].shift(1)\n",
+    "\n",
+    "        # If the current prediction is an I label\n",
+    "        # and y distance exceeds the average y difference\n",
+    "        # update to a B label and make it the start of a new entity\n",
+    "        entity_df[\"iob_pred\"] = np.where(\n",
+    "            (entity_df[\"iob_pred\"].str[0] == \"I\")\n",
+    "            & ((entity_df[\"top_left_y\"] - entity_df[\"prev_y\"]) >= avg_y_diff),\n",
+    "            \"B\" + entity_df[\"iob_pred\"].str[1:],  # Update to 'B'\n",
+    "            entity_df[\"iob_pred\"],  # Keep as is\n",
+    "        )\n",
+    "\n",
+    "        # Drop temporary columns\n",
+    "        entity_df = entity_df.drop(columns=[\"prev_y\", \"prev_iob\"])\n",
+    "        df.update(entity_df, overwrite=True)\n",
+    "\n",
+    "    return df\n",
+    "\n",
     "class LayoutLMInferencePipeline(Pipeline):\n",
     "    \"\"\"Pipeline for performing inference with fine-tuned LayoutLM.\"\"\"\n",
     "\n",
@@ -485,8 +497,10 @@
     "        )\n",
     "        df.update(first_in_group_df)\n",
     "        # filter for just words that were labeled with non \"other\" entities\n",
-    "        entities_df = df.sort_values(by=[\"top_left_y\", \"top_left_x\"])\n",
-    "        entities_df = entities_df[entities_df[\"pred\"] != \"other\"]\n",
+    "        entities_df = df[df[\"pred\"] != \"other\"]\n",
+    "        # boxes that have the same group label but are on different rows\n",
+    "        # should be updated to have two different B labels\n",
+    "        entities_df = separate_entities_by_row(entities_df)\n",
     "        # words are labeled with IOB format which stands for inside, outside, beginning\n",
     "        # merge B and I entities to form one entity group\n",
     "        # (i.e. \"B-Subsidiary\" and \"I-Subsidiary\" become just \"subsidiary\"), assign a group ID\n",
@@ -541,6 +555,21 @@
     "def _get_data(dataset):\n",
     "    yield from dataset\n",
     "\n",
+    "def _fill_known_nulls(df):\n",
+    "    \"\"\"Fill known nulls in location and own per column.\n",
+    "\n",
+    "    Fill with known values from rows with same subsidiary.\n",
+    "    \"\"\"\n",
+    "    if \"own_per\" in df:\n",
+    "        df[\"own_per\"] = df.groupby([\"id\", \"subsidiary\"])[\"own_per\"].transform(\n",
+    "            lambda group: group.ffill()\n",
+    "        )\n",
+    "    if \"loc\" in df:\n",
+    "        df[\"loc\"] = df.groupby([\"id\", \"subsidiary\"])[\"loc\"].transform(\n",
+    "            lambda group: group.ffill()\n",
+    "        )\n",
+    "    return df\n",
+    "\n",
     "class Ex21Extractor(mlflow.pyfunc.PythonModel):\n",
     "    \"\"\"Create an mlflow pyfunc model to perform full EX21 extraction.\"\"\"\n",
     "    def load_context(self, context):\n",
@@ -583,7 +612,8 @@
     "            all_output_df = pd.concat([all_output_df, output_df])\n",
     "        all_output_df.columns.name = None\n",
     "        all_output_df = clean_extracted_df(all_output_df)\n",
-    "        all_output_df = all_output_df[[\"id\", \"subsidiary\", \"loc\", \"own_per\"]]\n",
+    "        all_output_df = _fill_known_nulls(all_output_df)\n",
+    "        all_output_df = all_output_df[[\"id\", \"subsidiary\", \"loc\", \"own_per\"]].drop_duplicates()\n",
     "        all_output_df = all_output_df.reset_index(drop=True)\n",
     "        return extraction_metadata, all_output_df\n",
     "\n",
diff --git a/src/mozilla_sec_eia/package_data/validation_data/ex21_labels.csv b/src/mozilla_sec_eia/package_data/validation_data/ex21_labels.csv
index 006f344..b5dc6aa 100644
--- a/src/mozilla_sec_eia/package_data/validation_data/ex21_labels.csv
+++ b/src/mozilla_sec_eia/package_data/validation_data/ex21_labels.csv
@@ -515,7 +515,7 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage
 857501-0001065949-17-000087,First Surety Corporation,West Virginia,
 857501-0001065949-17-000087,"Crystal Mountain Water, Inc.",Arkansas,
 874501-0000874501-15-000013,Ambac Assurance Corporation,Wisconsin,
-874501-0000874501-15-000013,Ambac Assurance UK Limited,United Kingdom Insurance Company,
+874501-0000874501-15-000013,Ambac Assurance UK Limited,United Kingdom,
 874501-0000874501-15-000013,Ambac Capital Corporation,Delaware,
 874501-0000874501-15-000013,"Ambac Capital Funding, Inc.",Delaware,
 874501-0000874501-15-000013,"Ambac Credit Products, LLC",Delaware,
@@ -751,7 +751,7 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage
 100826-0001193125-09-042636,Illinois Power Securitization Limited Liability Company,Delaware,
 100826-0001193125-09-042636,Illinois Power Special Purpose Trust,Delaware,
 100826-0001193125-09-042636,Union Electric Company,Missouri,
-100826-0001193125-09-042636,Fuelco LLC,Delaware,33.3
+100826-0001193125-09-042636,Fuelco LLC,Delaware,33.33
 4904-0000004904-09-000040,"American Electric Power Company, Inc.",New York,
 4904-0000004904-09-000040,American Electric Power Service Corporation,New York,100.0
 4904-0000004904-09-000040,"AEP C&I Company, LLC",Delaware,100.0
@@ -1443,13 +1443,13 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage
 4127-0000004127-17-000033,"Skyworks Solutions Worldwide, Inc., Malaysia Branch",Malaysia,
 4127-0000004127-17-000033,Advanced Analogic Technologies Incorporated,Delaware,
 4127-0000004127-17-000033,"Advanced Analogic Technologies (China), Inc.",Peoples Republic of China,
-4127-0000004127-17-000033,Axiom Microdevices Inc.,Delaware,
-4127-0000004127-17-000033,ICWave LLC,Massachusetts,
-4127-0000004127-17-000033,Isolink inc.,California,
-4127-0000004127-17-000033,MEMS Solutions Inc.,Korea,
-4127-0000004127-17-000033,Quantance Inc.,Delaware,
-4127-0000004127-17-000033,SiGe Semiconductor Inc.,Delaware,
-4127-0000004127-17-000033,SiGe Semiconductor (U.S.) Corp.,Delaware,
+4127-0000004127-17-000033,"Axiom Microdevices, Inc.",Delaware,
+4127-0000004127-17-000033,"ICWave, LLC",Massachusetts,
+4127-0000004127-17-000033,"Isolink, Inc.",California,
+4127-0000004127-17-000033,"MEMS Solutions, Inc.",Korea,
+4127-0000004127-17-000033,"Quantance, Inc.",Delaware,
+4127-0000004127-17-000033,"SiGe Semiconductor, Inc.",Delaware,
+4127-0000004127-17-000033,"SiGe Semiconductor (U.S.), Corp.",Delaware,
 4127-0000004127-17-000033,SiGe Semiconductor (Europe) Limited,United Kingdom,
 4127-0000004127-17-000033,"Trans-Tech, Inc.",Maryland,
 4962-0001193125-10-041232,American Express Company,(USA) New York,
@@ -1627,8 +1627,8 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage
 11199-0001104659-06-016718,"Bemis Europe Holdings, S.A.",Belgium,100.0
 11199-0001104659-06-016718,Bemis Monceau S.A.,Belgium,100.0
 11199-0001104659-06-016718,Techy France S.A.R.L.,France,100.0
-11199-0001104659-06-016718,"Bemis Flexible Packaging de Mexico, S.A. de C.V.",Mexico,100.0
-11199-0001104659-06-016718,"Bemis Flexible Packaging Mexico Servicios, S.A. de C.V.",Mexico,100.0
+11199-0001104659-06-016718,"Bemis Flexible Packaging de Mexico, S.A. de C.V.",Mexico,87.0
+11199-0001104659-06-016718,"Bemis Flexible Packaging Mexico Servicios, S.A. de C.V.",Mexico,86.0
 11199-0001104659-06-016718,Bemis France Holdings S.A.S.,France,100.0
 11199-0001104659-06-016718,Bemis Packaging France S.A.S.,France,100.0
 11199-0001104659-06-016718,Bemis Le Trait S.A.S.,France,100.0
@@ -1684,6 +1684,26 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage
 11199-0001104659-06-016718,"Electronic Printing Products, Inc.",Ohio,100.0
 11199-0001104659-06-016718,Enterprise Software Inc.,Ohio,100.0
 11199-0001104659-06-016718,"MACtac Engineered Products, Inc.",Ohio,100.0
+11199-0001104659-06-016718,MACtac Europe S.A.,Belgium,89.0
+11199-0001104659-06-016718,Bemis Coordination Center S.A.,Belgium,67.0
+11199-0001104659-06-016718,Bemis Polska Sp. z o.o.,Poland,100.0
+11199-0001104659-06-016718,MACtac Asia-Pacific Self-Adhesive Products Pte Ltd.,Singapore,100.0
+11199-0001104659-06-016718,MACtac Deutschland GmbH,Germany,100.0
+11199-0001104659-06-016718,MACtac France E.U.R.L.,France,100.0
+11199-0001104659-06-016718,Multi-Fix N.V.,Belgium,100.0
+11199-0001104659-06-016718,MACtac Scandinavia A.B.,Sweden,100.0
+11199-0001104659-06-016718,MACtac Canada Limited/Limitee,Canada,100.0
+11199-0001104659-06-016718,MACtac Europe S.A.,Belgium,11.0
+11199-0001104659-06-016718,MACtac A.G.,Switzerland,100.0
+11199-0001104659-06-016718,"MACtac Mexico, S.A. de C.V.",Mexico,51.0
+11199-0001104659-06-016718,"MACtac Mexico Servicios, S.A. de C.V.",Mexico,51.0
+11199-0001104659-06-016718,"Morgan Adhesives America do Sul, Ltda.",Brazil,100.0
+11199-0001104659-06-016718,Paramount Packaging Corporation,Delaware,100.0
+11199-0001104659-06-016718,Bemis Elsham Limited,United Kingdom,100.0
+11199-0001104659-06-016718,"Bemis Shelbyville, Inc.",Tennessee,100.0
+11199-0001104659-06-016718,"Bemis Longview, Inc.",Texas,100.0
+11199-0001104659-06-016718,"PPC Royalty, Inc.",Delaware,100.0
+11199-0001104659-06-016718,"Pervel Industries, Inc.",Delaware,100.0
 29644-0001628280-16-019746,"Aerospace Filtration Systems, Inc.","Chesterfield, MO USA",
 29644-0001628280-16-019746,ASHC LLC,"Minneapolis, MN USA",
 29644-0001628280-16-019746,DLX Capital S.a.r.l.,"Luxembourg City, Luxembourg",
@@ -1937,15 +1957,15 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage
 75829-0001206774-11-002167,Pall Austria Filter GesmbH,Austria,
 75829-0001206774-11-002167,Pall (Canada) Limited,Canada,
 75829-0001206774-11-002167,Pall Do Brasil,Brazil,
-75829-0001206774-11-002167,Pall Europe Limited (a),England,
+75829-0001206774-11-002167,Pall Europe Limited,England,
 75829-0001206774-11-002167,Pall France S.A.S.,France,
 75829-0001206774-11-002167,Pall Deutschland Beteiligungs GmbH,Germany,
 75829-0001206774-11-002167,Pall Deutschland Holding GmbH & Co. KG Partnership (c),Germany,
 75829-0001206774-11-002167,Pall Italia S.R.L.,Italy,
 75829-0001206774-11-002167,Pall Manufacturing UK Limited,England,
 75829-0001206774-11-002167,Gelman Ireland Ltd.,Ireland,
-75829-0001206774-11-002167,Pall Netherlands B.V. (a),The Netherlands,
-75829-0001206774-11-002167,PLLN C.V. Partnership (b),The Netherlands,
+75829-0001206774-11-002167,Pall Netherlands B.V.,The Netherlands,
+75829-0001206774-11-002167,PLLN C.V. Partnership,The Netherlands,
 75829-0001206774-11-002167,Pall Norge AS,Norway,
 75829-0001206774-11-002167,Pall Espana S.A.U.,Spain,
 75829-0001206774-11-002167,Pall Norden AB,Sweden,

From 4d29037e502f6b8e88635b17fd12e82aa44810ab Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Tue, 8 Oct 2024 18:28:57 -0400
Subject: [PATCH 102/161] Update classifier model

---
 src/mozilla_sec_eia/models/sec10k/__init__.py |   4 +-
 .../exhibit21_layout_classifier.ipynb         | 127 +++++++++++++-----
 2 files changed, 94 insertions(+), 37 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py
index 2ecf3c2..a1bd6c3 100644
--- a/src/mozilla_sec_eia/models/sec10k/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/__init__.py
@@ -107,11 +107,11 @@
         "mlflow_interface": mlflow_interface_resource,
         "layoutlm_io_manager": MlflowPyfuncModelIOManager(
             mlflow_interface=mlflow_interface_resource,
-            uri="runs:/d603f8e219da4fd39f3c2f8d7d3bcb40/exhibit21_extractor",
+            uri="runs:/582fcebbd4cf4d8b8a8f995406ddc560/exhibit21_extractor",
         ),
         "ex21_classifier_io_manager": MlflowPyfuncModelIOManager(
             mlflow_interface=mlflow_interface_resource,
-            uri="runs:/08802dbf347c4cd5b66751c11328a06f/exhibit21_layout_classifier",
+            uri="runs:/cbdd906766b2427c93e9c957be6ea9c8/exhibit21_layout_classifier",
         ),
         "pandas_parquet_io_manager": PandasParquetIOManager(
             base_path=UPath("gs://sec10k-outputs/v2")
diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb
index 584832c..8315fc1 100644
--- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb
+++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb
@@ -19,7 +19,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 1,
    "id": "b4963648-2aac-46a7-9778-8808c1e5eeb2",
    "metadata": {
     "tags": [
@@ -32,9 +32,9 @@
      "output_type": "stream",
      "text": [
       "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n",
-      "2024-10-08 13:55:05 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_layout_labels using PickledObjectFilesystemIOManager...\n",
+      "2024-10-08 18:11:22 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_layout_labels using PickledObjectFilesystemIOManager...\n",
       "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n",
-      "2024-10-08 13:55:05 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_layout_classifier_training_dataset using PickledObjectFilesystemIOManager...\n"
+      "2024-10-08 18:11:22 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_layout_classifier_training_dataset using PickledObjectFilesystemIOManager...\n"
      ]
     }
    ],
@@ -55,7 +55,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 43,
+   "execution_count": 2,
    "id": "ee4ed368-7d01-4cb8-952f-f7941900d669",
    "metadata": {
     "tags": []
@@ -109,7 +109,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 60,
+   "execution_count": 3,
    "id": "f71e2dfc-552d-49e7-b23d-267c2158efe2",
    "metadata": {
     "tags": []
@@ -118,8 +118,10 @@
    "source": [
     "import numpy as np\n",
     "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.preprocessing import StandardScaler\n",
     "\n",
     "X = ex21_layout_classifier_training_dataset.sort_values(by=[\"id\"]).apply(calculate_features, axis=1)\n",
+    "X = StandardScaler().fit_transform(X)\n",
     "y = np.where(ex21_layout_labels.sort_values(by=[\"filename\"])[\"layout\"] == \"Paragraph\", 1, 0)\n",
     "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=16)"
    ]
@@ -134,7 +136,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 61,
+   "execution_count": 4,
    "id": "08bf5f11-af80-4c65-a005-2a2de49c30b5",
    "metadata": {
     "tags": []
@@ -154,7 +156,8 @@
     "    def predict(self, context, model_input: pd.DataFrame):\n",
     "        \"\"\"Create feature matrix from inference dataset and use trained model for prediction.\"\"\"\n",
     "        features_df = model_input.apply(calculate_features, axis=1)\n",
-    "        return self.model.predict(features_df)"
+    "        scaled_features = StandardScaler().fit_transform(features_df)\n",
+    "        return self.model.predict(scaled_features)"
    ]
   },
   {
@@ -167,7 +170,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 66,
+   "execution_count": 6,
    "id": "55d2194e-82a8-4d1e-8318-a8c893dc29de",
    "metadata": {
     "tags": []
@@ -177,17 +180,9 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2024/10/08 16:10:39 WARNING mlflow.utils.autologging_utils: MLflow sklearn autologging is known to be compatible with 0.24.1 <= scikit-learn <= 1.5.1, but the installed version is 1.5.2. If you encounter errors during autologging, try upgrading / downgrading scikit-learn to a compatible version, or try upgrading MLflow.\n",
-      "2024/10/08 16:10:39 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.\n",
-      "2024/10/08 16:10:40 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.\n",
-      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
-      "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
-      "\n",
-      "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
-      "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
-      "Please also refer to the documentation for alternative solver options:\n",
-      "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
-      "  n_iter_i = _check_optimize_result(\n",
+      "2024/10/08 18:23:23 WARNING mlflow.utils.autologging_utils: MLflow sklearn autologging is known to be compatible with 0.24.1 <= scikit-learn <= 1.5.1, but the installed version is 1.5.2. If you encounter errors during autologging, try upgrading / downgrading scikit-learn to a compatible version, or try upgrading MLflow.\n",
+      "2024/10/08 18:23:23 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.\n",
+      "2024/10/08 18:23:24 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.\n",
       "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/mlflow/types/utils.py:407: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.\n",
       "  warnings.warn(\n"
      ]
@@ -195,7 +190,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "dd07e390d13f4f6692ae96288ffb1dbb",
+       "model_id": "0fd59c52d9cd47548fa31d3edf451082",
        "version_major": 2,
        "version_minor": 0
       },
@@ -210,17 +205,73 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2024/10/08 16:11:30 INFO mlflow.tracking._tracking_service.client: 🏃 View run languid-shrimp-450 at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/15/runs/08802dbf347c4cd5b66751c11328a06f.\n",
-      "2024/10/08 16:11:30 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/15.\n",
-      "2024/10/08 16:11:30 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...\n",
-      "2024/10/08 16:11:31 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!\n"
+      "2024/10/08 18:24:13 INFO mlflow.tracking._tracking_service.client: 🏃 View run LogisticRegression at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/15/runs/5f5d526e1e16442983679d6035599df2.\n",
+      "2024/10/08 18:24:13 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/15.\n",
+      "2024/10/08 18:24:14 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...\n",
+      "2024/10/08 18:24:14 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!\n",
+      "2024/10/08 18:24:15 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/mlflow/types/utils.py:407: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "1c936d95469b42cdaec2a510caac0e97",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/10/08 18:25:04 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomForest at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/15/runs/84642d0599894058b3ebe85f7f43eab9.\n",
+      "2024/10/08 18:25:04 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/15.\n",
+      "2024/10/08 18:25:05 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...\n",
+      "2024/10/08 18:25:05 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!\n",
+      "2024/10/08 18:25:06 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/mlflow/types/utils.py:407: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "beff1b6195844fdfa6d30048f4164f17",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/10/08 18:25:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run SVM at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/15/runs/cbdd906766b2427c93e9c957be6ea9c8.\n",
+      "2024/10/08 18:25:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/15.\n",
+      "2024/10/08 18:25:56 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...\n",
+      "2024/10/08 18:25:57 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!\n"
      ]
     }
    ],
    "source": [
     "from dotenv import load_dotenv\n",
     "from mlflow.models import infer_signature\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
     "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.svm import SVC\n",
     "\n",
     "from mozilla_sec_eia.library.mlflow import configure_mlflow\n",
     "\n",
@@ -233,18 +284,24 @@
     "# Autolog sklearn model\n",
     "mlflow.autolog()\n",
     "\n",
-    "model = LogisticRegression()\n",
+    "classifiers = {\n",
+    "    \"LogisticRegression\": LogisticRegression(max_iter=500),\n",
+    "    \"RandomForest\": RandomForestClassifier(n_estimators=100),\n",
+    "    \"SVM\": SVC(kernel=\"linear\")\n",
+    "}\n",
     "pyfunc_model = Ex21LayoutClassifier()\n",
-    "with mlflow.start_run():\n",
-    "    model.fit(X_train, y_train)\n",
-    "    model.score(X_test, y_test)\n",
-    "    sklearn_model_uri = mlflow.get_artifact_uri(\"model\")\n",
-    "    mlflow.pyfunc.log_model(\n",
-    "        artifact_path=\"exhibit21_layout_classifier\",\n",
-    "        python_model=pyfunc_model,\n",
-    "        artifacts={\"layout_classifier\": sklearn_model_uri},\n",
-    "        signature=infer_signature(ex21_layout_classifier_training_dataset, y),\n",
-    "    )"
+    "\n",
+    "for classifier, model in classifiers.items():\n",
+    "    with mlflow.start_run(run_name=classifier):\n",
+    "        model.fit(X_train, y_train)\n",
+    "        model.score(X_test, y_test)\n",
+    "        sklearn_model_uri = mlflow.get_artifact_uri(\"model\")\n",
+    "        mlflow.pyfunc.log_model(\n",
+    "            artifact_path=\"exhibit21_layout_classifier\",\n",
+    "            python_model=pyfunc_model,\n",
+    "            artifacts={\"layout_classifier\": sklearn_model_uri},\n",
+    "            signature=infer_signature(ex21_layout_classifier_training_dataset, y),\n",
+    "        )"
    ]
   },
   {

From 85c44ff88981ddf95003fbe72e432b9a9c002930 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 9 Oct 2024 13:04:20 -0400
Subject: [PATCH 103/161] Fix set on copy pandas issue

---
 src/mozilla_sec_eia/models/sec10k/__init__.py |  2 +-
 .../notebooks/exhibit21_extractor.ipynb       | 72 ++++++++++---------
 2 files changed, 38 insertions(+), 36 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py
index a1bd6c3..985d739 100644
--- a/src/mozilla_sec_eia/models/sec10k/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/__init__.py
@@ -111,7 +111,7 @@
         ),
         "ex21_classifier_io_manager": MlflowPyfuncModelIOManager(
             mlflow_interface=mlflow_interface_resource,
-            uri="runs:/cbdd906766b2427c93e9c957be6ea9c8/exhibit21_layout_classifier",
+            uri="runs:/1d84be1656864f82b7b990a64fd113e3/exhibit21_layout_classifier",
         ),
         "pandas_parquet_io_manager": PandasParquetIOManager(
             base_path=UPath("gs://sec10k-outputs/v2")
diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
index e155387..1b77029 100644
--- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
+++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
@@ -330,7 +330,7 @@
     ")\n",
     "\n",
     "\n",
-    "def separate_entities_by_row(df):\n",
+    "def separate_entities_by_row(entity_df):\n",
     "    \"\"\"Separate entities that span multiple rows and should be distinct.\n",
     "\n",
     "    Sometimes LayoutLM groups multiple entities that span multiple rows\n",
@@ -340,39 +340,34 @@
     "    is greater than the third quantile of y value spacing.\n",
     "    \"\"\"\n",
     "    threshold = 1.0\n",
-    "    for entity in [\"subsidiary\", \"loc\", \"own_per\"]:\n",
-    "        entity_df = df[df[\"pred\"] == entity]\n",
-    "        entity_df[\"line_group\"] = entity_df[\"top_left_y\"].transform(\n",
-    "            lambda y: (y // threshold).astype(int)\n",
-    "        )\n",
-    "        # Get the unique y-values for each line (group) per file\n",
-    "        line_positions = (\n",
-    "            entity_df.groupby([\"line_group\"])[\"top_left_y\"].mean().reset_index()\n",
-    "        )\n",
-    "        # Calculate the difference between adjacent y-values (i.e., distance between lines)\n",
-    "        line_positions[\"y_diff\"] = line_positions[\"top_left_y\"].diff()\n",
-    "        # Filter out NaN values and take the mean of the valid distances\n",
-    "        y_diffs = line_positions[\"y_diff\"].dropna()\n",
-    "        avg_y_diff = y_diffs.apply(np.floor).mean()\n",
-    "        # if an I labeled entity is more than avg_y_diff from it's previoius box then make it a B entity\n",
-    "        entity_df[\"prev_y\"] = entity_df[\"top_left_y\"].shift(1)\n",
-    "        entity_df[\"prev_iob\"] = entity_df[\"iob_pred\"].shift(1)\n",
-    "\n",
-    "        # If the current prediction is an I label\n",
-    "        # and y distance exceeds the average y difference\n",
-    "        # update to a B label and make it the start of a new entity\n",
-    "        entity_df[\"iob_pred\"] = np.where(\n",
-    "            (entity_df[\"iob_pred\"].str[0] == \"I\")\n",
-    "            & ((entity_df[\"top_left_y\"] - entity_df[\"prev_y\"]) >= avg_y_diff),\n",
-    "            \"B\" + entity_df[\"iob_pred\"].str[1:],  # Update to 'B'\n",
-    "            entity_df[\"iob_pred\"],  # Keep as is\n",
-    "        )\n",
-    "\n",
-    "        # Drop temporary columns\n",
-    "        entity_df = entity_df.drop(columns=[\"prev_y\", \"prev_iob\"])\n",
-    "        df.update(entity_df, overwrite=True)\n",
+    "    entity_df.loc[:, \"line_group\"] = entity_df.loc[:, \"top_left_y\"].transform(\n",
+    "        lambda y: (y // threshold).astype(int)\n",
+    "    )\n",
+    "    # Get the unique y-values for each line (group) per file\n",
+    "    line_positions = (\n",
+    "        entity_df.groupby([\"line_group\"])[\"top_left_y\"].mean().reset_index()\n",
+    "    )\n",
+    "    # Calculate the difference between adjacent y-values (i.e., distance between lines)\n",
+    "    line_positions.loc[:, \"y_diff\"] = line_positions.loc[:, \"top_left_y\"].diff()\n",
+    "    # Filter out NaN values and take the mean of the valid distances\n",
+    "    y_diffs = line_positions[\"y_diff\"].dropna()\n",
+    "    avg_y_diff = y_diffs.apply(np.floor).mean()\n",
+    "    # if an I labeled entity is more than avg_y_diff from it's previoius box then make it a B entity\n",
+    "    entity_df.loc[:, \"prev_y\"] = entity_df.loc[:, \"top_left_y\"].shift(1)\n",
+    "    entity_df.loc[:, \"prev_iob\"] = entity_df.loc[:, \"iob_pred\"].shift(1)\n",
+    "\n",
+    "    # If the current prediction is an I label\n",
+    "    # and y distance exceeds the average y difference\n",
+    "    # update to a B label and make it the start of a new entity\n",
+    "    entity_df.loc[:, \"iob_pred\"] = np.where(\n",
+    "        (entity_df[\"iob_pred\"].str[0] == \"I\")\n",
+    "        & ((entity_df[\"top_left_y\"] - entity_df[\"prev_y\"]) >= avg_y_diff),\n",
+    "        \"B\" + entity_df[\"iob_pred\"].str[1:],  # Update to 'B'\n",
+    "        entity_df[\"iob_pred\"],  # Keep as is\n",
+    "    )\n",
     "\n",
-    "    return df\n",
+    "    # Drop temporary columns\n",
+    "    return entity_df.drop(columns=[\"prev_y\", \"prev_iob\"])\n",
     "\n",
     "class LayoutLMInferencePipeline(Pipeline):\n",
     "    \"\"\"Pipeline for performing inference with fine-tuned LayoutLM.\"\"\"\n",
@@ -500,8 +495,7 @@
     "        entities_df = df[df[\"pred\"] != \"other\"]\n",
     "        # boxes that have the same group label but are on different rows\n",
     "        # should be updated to have two different B labels\n",
-    "        entities_df = separate_entities_by_row(entities_df)\n",
-    "        # words are labeled with IOB format which stands for inside, outside, beginning\n",
+    "        entities_df = entities_df.groupby([\"pred\"], as_index=False).apply(separate_entities_by_row).reset_index(level=0).sort_index()\n",
     "        # merge B and I entities to form one entity group\n",
     "        # (i.e. \"B-Subsidiary\" and \"I-Subsidiary\" become just \"subsidiary\"), assign a group ID\n",
     "        entities_df[\"group\"] = (entities_df[\"iob_pred\"].str.startswith(\"B-\")).cumsum()\n",
@@ -781,6 +775,14 @@
     "    mlflow.log_table(extracted, \"extracted_data.json\")\n",
     "    mlflow.log_table(metadata, \"extraction_metadata.json\")"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d11e2a7b-ec74-4930-b331-144a8584c72f",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {

From 52e358063073d8d4e4d2dba712ceec0040b2c515 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 9 Oct 2024 14:08:36 -0400
Subject: [PATCH 104/161] Fix model uri's

---
 src/mozilla_sec_eia/models/sec10k/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py
index 985d739..94c643b 100644
--- a/src/mozilla_sec_eia/models/sec10k/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/__init__.py
@@ -107,11 +107,11 @@
         "mlflow_interface": mlflow_interface_resource,
         "layoutlm_io_manager": MlflowPyfuncModelIOManager(
             mlflow_interface=mlflow_interface_resource,
-            uri="runs:/582fcebbd4cf4d8b8a8f995406ddc560/exhibit21_extractor",
+            uri="runs:/1d84be1656864f82b7b990a64fd113e3/exhibit21_extractor",
         ),
         "ex21_classifier_io_manager": MlflowPyfuncModelIOManager(
             mlflow_interface=mlflow_interface_resource,
-            uri="runs:/1d84be1656864f82b7b990a64fd113e3/exhibit21_layout_classifier",
+            uri="runs:/cbdd906766b2427c93e9c957be6ea9c8/exhibit21_layout_classifier",
         ),
         "pandas_parquet_io_manager": PandasParquetIOManager(
             base_path=UPath("gs://sec10k-outputs/v2")

From b709053f885b68cefd10d4d6bfe5108c15060640 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 9 Oct 2024 16:36:22 -0400
Subject: [PATCH 105/161] Fix indices in extraction model

---
 src/mozilla_sec_eia/models/sec10k/__init__.py |   2 +-
 .../notebooks/exhibit21_extractor.ipynb       | 464 +++++++++++++++++-
 2 files changed, 454 insertions(+), 12 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py
index 94c643b..7d502c1 100644
--- a/src/mozilla_sec_eia/models/sec10k/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/__init__.py
@@ -107,7 +107,7 @@
         "mlflow_interface": mlflow_interface_resource,
         "layoutlm_io_manager": MlflowPyfuncModelIOManager(
             mlflow_interface=mlflow_interface_resource,
-            uri="runs:/1d84be1656864f82b7b990a64fd113e3/exhibit21_extractor",
+            uri="runs:/426dd1b67cbd4677b6fa22b6b9d9173a/exhibit21_extractor",
         ),
         "ex21_classifier_io_manager": MlflowPyfuncModelIOManager(
             mlflow_interface=mlflow_interface_resource,
diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
index 1b77029..d136a25 100644
--- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
+++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
@@ -38,14 +38,29 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "48f185de-95ef-4194-9245-93f8d603d2e6",
    "metadata": {
     "tags": [
      "parameters"
     ]
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n",
+      "2024-10-09 15:25:02 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_training_data using PickledObjectFilesystemIOManager...\n",
+      "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n",
+      "2024-10-09 15:25:03 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_failed_parsing_metadata using PickledObjectFilesystemIOManager...\n",
+      "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n",
+      "2024-10-09 15:25:03 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_inference_dataset using PickledObjectFilesystemIOManager...\n",
+      "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n",
+      "2024-10-09 15:25:04 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_validation_set using PickledObjectFilesystemIOManager...\n"
+     ]
+    }
+   ],
    "source": [
     "import dagstermill\n",
     "\n",
@@ -85,7 +100,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "id": "9372b908-d9b9-4d18-a5bf-d332648b3e49",
    "metadata": {
     "tags": []
@@ -156,7 +171,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "id": "71d205b2-e6ea-4ad0-982c-22e762269119",
    "metadata": {
     "tags": []
@@ -311,7 +326,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "id": "42c8e920-d671-40c2-b5db-c43611a33897",
    "metadata": {
     "tags": []
@@ -495,7 +510,9 @@
     "        entities_df = df[df[\"pred\"] != \"other\"]\n",
     "        # boxes that have the same group label but are on different rows\n",
     "        # should be updated to have two different B labels\n",
-    "        entities_df = entities_df.groupby([\"pred\"], as_index=False).apply(separate_entities_by_row).reset_index(level=0).sort_index()\n",
+    "\n",
+    "        entities_df = entities_df.groupby(\"pred\").apply(separate_entities_by_row, include_groups=False)\n",
+    "        entities_df = entities_df.reset_index(\"pred\").sort_index()\n",
     "        # merge B and I entities to form one entity group\n",
     "        # (i.e. \"B-Subsidiary\" and \"I-Subsidiary\" become just \"subsidiary\"), assign a group ID\n",
     "        entities_df[\"group\"] = (entities_df[\"iob_pred\"].str.startswith(\"B-\")).cumsum()\n",
@@ -525,12 +542,71 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
    "id": "4d802e00-1ca4-40b3-b15b-561711a9db70",
    "metadata": {
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "3145df6c447a4f958ac86b7a84c9f52d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/10/09 15:26:25 INFO mlflow.types.utils: Unsupported type hint: <class 'pandas.core.frame.DataFrame'>, skipping schema inference\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "bae0a0244e4141449874b48f750bd443",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading artifacts:   0%|          | 0/17 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/10/09 15:26:54 WARNING mlflow.utils.requirements_utils: The following packages were not found in the public PyPI package index as of 2024-08-29; if these packages are not present in the public PyPI index, you must install them manually before loading your model: {'catalystcoop-mozilla-sec-eia'}\n",
+      "2024/10/09 15:26:54 WARNING mlflow.utils.requirements_utils: Found catalystcoop-mozilla-sec-eia version (0.1.dev298+g6f9d34a.d20240923) contains a local version label (+g6f9d34a.d20240923). MLflow logged a pip requirement for this package as 'catalystcoop-mozilla-sec-eia==0.1.dev298' without the local version label to make it installable from PyPI. To specify pip requirements containing local version labels, please use `conda_env` or `pip_requirements`.\n",
+      "2024/10/09 15:26:54 WARNING mlflow.transformers.model_io: Could not specify device parameter for this pipeline type.Falling back to loading the model with the default device.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a0efe85e59d7401092b6bc7eed6d0bb5",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
    "source": [
     "from PIL import Image\n",
     "\n",
@@ -651,7 +727,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 10,
    "id": "f79bd14d-5156-4f34-9a50-e9c813b822cf",
    "metadata": {
     "tags": []
@@ -755,10 +831,376 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 11,
    "id": "dfb56470-8527-424c-a9e5-4135e55fde4d",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/10/09 15:26:56 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
+      "  warnings.warn(\n",
+      "/tmp/ipykernel_168606/2514174394.py:29: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
+      "  lambda group: group.ffill()\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:53: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_validation_set = pd.concat(\n",
+      "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
+      "  padded_compute_set = pd.concat(\n",
+      "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/mlflow/types/utils.py:407: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "21da151ecd6d4a9187bf77b40c7a8aed",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading artifacts:   0%|          | 0/17 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024/10/09 15:28:01 WARNING mlflow.utils.requirements_utils: The following packages were not found in the public PyPI package index as of 2024-08-29; if these packages are not present in the public PyPI index, you must install them manually before loading your model: {'catalystcoop-mozilla-sec-eia'}\n",
+      "2024/10/09 15:28:01 WARNING mlflow.utils.requirements_utils: Found catalystcoop-mozilla-sec-eia version (0.1.dev298+g6f9d34a.d20240923) contains a local version label (+g6f9d34a.d20240923). MLflow logged a pip requirement for this package as 'catalystcoop-mozilla-sec-eia==0.1.dev298' without the local version label to make it installable from PyPI. To specify pip requirements containing local version labels, please use `conda_env` or `pip_requirements`.\n",
+      "2024/10/09 15:35:17 INFO mlflow.tracking._tracking_service.client: 🏃 View run unleashed-snake-419 at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/13/runs/426dd1b67cbd4677b6fa22b6b9d9173a.\n",
+      "2024/10/09 15:35:17 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/13.\n",
+      "2024/10/09 15:35:17 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...\n",
+      "2024/10/09 15:35:17 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!\n"
+     ]
+    }
+   ],
    "source": [
     "with mlflow.start_run(parent_run_id=model_info.run_id, nested=True):\n",
     "    metadata, extracted = ex21_extraction_model.predict(ex21_inference_dataset.copy())\n",

From b8dad3c69d34649c33e84a19f0de164ad5f9dc9b Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 9 Oct 2024 17:43:47 -0400
Subject: [PATCH 106/161] Fix typo

---
 src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
index 9a70b74..4e2029c 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
@@ -110,7 +110,7 @@ def collect_extracted_chunks(
     extracted_dfs = [df for df in extracted_dfs if not df.empty]
     metadata_df = pd.concat(metadata_dfs)
     extracted_df = pd.concat(extracted_dfs)
-    layout_df = (pd.concat(layout_dfs),)
+    layout_df = pd.concat(layout_dfs)
     return (
         Sec10kExtractionMetadata.validate(metadata_df),
         Ex21CompanyOwnership.validate(extracted_df),

From e6b29ffcfae7c037b5554927d087a42b0bfc80a7 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Thu, 10 Oct 2024 16:32:33 -0400
Subject: [PATCH 107/161] Add asset factory for loading models

---
 .../library/mlflow/__init__.py                | 19 +++++++++
 .../library/mlflow/mlflow_io_managers.py      | 24 +++++------
 src/mozilla_sec_eia/models/sec10k/__init__.py | 15 +++----
 .../models/sec10k/ex_21/__init__.py           | 41 +++++++++++++------
 4 files changed, 64 insertions(+), 35 deletions(-)

diff --git a/src/mozilla_sec_eia/library/mlflow/__init__.py b/src/mozilla_sec_eia/library/mlflow/__init__.py
index 17a765d..9376d33 100644
--- a/src/mozilla_sec_eia/library/mlflow/__init__.py
+++ b/src/mozilla_sec_eia/library/mlflow/__init__.py
@@ -1,5 +1,8 @@
 """Implement tooling to interface with mlflow experiment tracking."""
 
+from dagster import Config, asset
+from pydantic import create_model
+
 from .mlflow_io_managers import (
     MlflowBaseIOManager,
     MlflowMetricsIOManager,
@@ -13,6 +16,22 @@
 )
 
 
+def pyfunc_model_asset_factory(name: str, mlflow_run_uri: str):
+    """Create asset for loading a model logged to mlflow."""
+    PyfuncConfig = create_model(  # NOQA: N806
+        f"PyfuncConfig{name}", mlflow_run_uri=(str, mlflow_run_uri), __base__=Config
+    )
+
+    @asset(
+        name=name,
+        io_manager_key="pyfunc_model_io_manager",
+    )
+    def _model_asset(config: PyfuncConfig):
+        return config.mlflow_run_uri
+
+    return _model_asset
+
+
 def get_mlflow_io_manager(
     key: str,
     mlflow_interface: MlflowInterface | None = None,
diff --git a/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py b/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py
index abc2d1c..fffb424 100644
--- a/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py
+++ b/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py
@@ -31,25 +31,25 @@ class MlflowPyfuncModelIOManager(MlflowBaseIOManager):
 
     uri: str | None = None
 
-    def handle_output(self, context, obj):
-        """Outputs not implemented."""
-        raise NotImplementedError("Logging models not supported by io manager.")
+    def handle_output(self, context: OutputContext, model_uri: str):
+        """Takes model uri as a string and caches the model locally for future use."""
+        cache_path = self.mlflow_interface.dagster_home_path / "model_cache"
+        cache_path.mkdir(exist_ok=True, parents=True)
+
+        logger.info(f"Caching {context.name} model at {cache_path}")
+        mlflow.pyfunc.load_model(
+            model_uri,
+            dst_path=cache_path,
+        )
 
     def load_input(self, context: InputContext):
         """Load pyfunc model with mlflow server."""
         cache_path = (
             self.mlflow_interface.dagster_home_path / "model_cache" / context.name
         )
-        cache_path.mkdir(exist_ok=True, parents=True)
-
-        model_uri = self.uri
-        if model_uri is None:
-            model_uri = f"models:/{context.name}"
+        logger.info(f"Loading {context.name} model from {cache_path}")
 
-        return mlflow.pyfunc.load_model(
-            model_uri,
-            dst_path=cache_path,
-        )
+        return mlflow.pyfunc.load_model(cache_path)
 
 
 class MlflowPandasArtifactIOManager(MlflowBaseIOManager):
diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py
index 7d502c1..b01d58f 100644
--- a/src/mozilla_sec_eia/models/sec10k/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/__init__.py
@@ -52,7 +52,7 @@
 
 
 exhibit21_extractor = define_dagstermill_asset(
-    name="exhibit21_extractor",
+    name="train_exhibit21_extractor",
     notebook_path=file_relative_path(__file__, "notebooks/exhibit21_extractor.ipynb"),
     config_schema=ex_21.data.Ex21TrainConfig.to_config_schema(),
     ins={
@@ -71,7 +71,7 @@
 
 
 exhibit21_layout_classifier = define_dagstermill_asset(
-    name="exhibit21_layout_classifier",
+    name="train_exhibit21_layout_classifier",
     notebook_path=file_relative_path(
         __file__, "notebooks/exhibit21_layout_classifier.ipynb"
     ),
@@ -105,17 +105,12 @@
     resources={
         "cloud_interface": cloud_interface_resource,
         "mlflow_interface": mlflow_interface_resource,
-        "layoutlm_io_manager": MlflowPyfuncModelIOManager(
-            mlflow_interface=mlflow_interface_resource,
-            uri="runs:/426dd1b67cbd4677b6fa22b6b9d9173a/exhibit21_extractor",
-        ),
-        "ex21_classifier_io_manager": MlflowPyfuncModelIOManager(
-            mlflow_interface=mlflow_interface_resource,
-            uri="runs:/cbdd906766b2427c93e9c957be6ea9c8/exhibit21_layout_classifier",
-        ),
         "pandas_parquet_io_manager": PandasParquetIOManager(
             base_path=UPath("gs://sec10k-outputs/v2")
         ),
+        "pyfunc_model_io_manager": MlflowPyfuncModelIOManager(
+            mlflow_interface=mlflow_interface_resource
+        ),
         "output_notebook_io_manager": ConfigurableLocalOutputNotebookIOManager(),
     }
     | mlflow_train_test_io_managers,
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
index 4e2029c..5a9abc9 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
@@ -6,13 +6,14 @@
 import pandas as pd
 from dagster import (
     AssetOut,
-    In,
     Out,
     graph_multi_asset,
     op,
 )
 from mlflow.pyfunc import PyFuncModel
 
+from mozilla_sec_eia.library.mlflow import pyfunc_model_asset_factory
+
 from ..entities import (
     Ex21CompanyOwnership,
     Ex21Layout,
@@ -33,7 +34,6 @@
         "metadata": Out(dagster_type=sec10k_extract_metadata_type),
         "extracted": Out(dagster_type=ex21_extract_type),
     },
-    ins={"exhibit21_extractor": In(input_manager_key="layoutlm_io_manager")},
     tags={"model": "exhibit21_extractor"},
 )
 def extract_filing_chunk(
@@ -62,14 +62,7 @@ def extract_filing_chunk(
     return metadata, extracted
 
 
-@op(
-    out={"layout": Out(dagster_type=ex21_layout_type)},
-    ins={
-        "exhibit21_layout_classifier": In(
-            input_manager_key="ex21_classifier_io_manager"
-        )
-    },
-)
+@op(out={"layout": Out(dagster_type=ex21_layout_type)})
 def classify_chunk_layouts(
     parsed_chunk: tuple[pd.DataFrame, pd.DataFrame],
     exhibit21_layout_classifier: PyFuncModel,
@@ -129,6 +122,17 @@ def create_dataset(
     )
 
 
+exhibit21_extractor = pyfunc_model_asset_factory(
+    name="exhibit21_extractor",
+    mlflow_run_uri="runs:/426dd1b67cbd4677b6fa22b6b9d9173a/exhibit21_extractor",
+)
+
+exhibit21_layout_classifier = pyfunc_model_asset_factory(
+    name="exhibit21_layout_classifier",
+    mlflow_run_uri="runs:/cbdd906766b2427c93e9c957be6ea9c8/exhibit21_layout_classifier",
+)
+
+
 @graph_multi_asset(
     outs={
         "ex21_extraction_metadata": AssetOut(
@@ -143,15 +147,26 @@ def create_dataset(
 )
 def ex21_extract(
     sec10k_filing_metadata: pd.DataFrame,
+    exhibit21_extractor: PyFuncModel,
+    exhibit21_layout_classifier: PyFuncModel,
 ):
     """Extract ownership info from exhibit 21 docs."""
     filing_chunks = chunk_filings(sec10k_filing_metadata)
     parsed_chunks = filing_chunks.map(create_dataset)
-    layout_chunks = parsed_chunks.map(classify_chunk_layouts)
-    metadata_chunks, extracted_chunks = parsed_chunks.map(extract_filing_chunk)
+    layout_chunks = parsed_chunks.map(
+        lambda chunk: classify_chunk_layouts(chunk, exhibit21_layout_classifier)
+    )
+    metadata_chunks, extracted_chunks = parsed_chunks.map(
+        lambda chunk: extract_filing_chunk(chunk, exhibit21_extractor)
+    )
     return collect_extracted_chunks(
         metadata_chunks.collect(), extracted_chunks.collect(), layout_chunks.collect()
     )
 
 
-production_assets = [sec10k_filing_metadata, ex21_extract]
+production_assets = [
+    sec10k_filing_metadata,
+    ex21_extract,
+    exhibit21_extractor,
+    exhibit21_layout_classifier,
+]

From 3d11777b2c7c6a991d45b6b4cca94f1407a15b47 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Thu, 10 Oct 2024 17:50:18 -0400
Subject: [PATCH 108/161] Catch layout classification NaN exception

---
 .../models/sec10k/ex_21/__init__.py           | 22 ++++++++++++++-----
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
index 5a9abc9..2e6a13b 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
@@ -69,12 +69,22 @@ def classify_chunk_layouts(
 ) -> pd.DataFrame:
     """Extract a set of filings and return results."""
     _, inference_dataset = parsed_chunk
-    return pd.DataFrame(
-        {
-            "filename": inference_dataset["id"],
-            "paragraph": exhibit21_layout_classifier.predict(inference_dataset),
-        }
-    ).set_index("filename")
+    try:
+        df = pd.DataFrame(
+            {
+                "filename": inference_dataset["id"],
+                "paragraph": exhibit21_layout_classifier.predict(inference_dataset),
+            }
+        ).set_index("filename")
+    except ValueError:
+        df = pd.DataFrame(
+            {
+                "filename": inference_dataset["id"],
+                "paragraph": [None] * len(inference_dataset),
+            }
+        ).set_index("filename")
+
+    return df
 
 
 @op(

From df5fe0d8db54162cb054aee94e80be0e85c34261 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Thu, 10 Oct 2024 18:27:17 -0400
Subject: [PATCH 109/161] Use GCS pickle io-manager

---
 pyproject.toml                                | 1 +
 src/mozilla_sec_eia/models/sec10k/__init__.py | 6 ++++++
 2 files changed, 7 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 8a66d85..9e2e66e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,6 +24,7 @@ dependencies = [
     "dagster-mlflow",
     "dagster-webserver",
     "dagster-pandera",
+    "dagster-gcp",
     "dagstermill",
     "datasets>=2.1,<3", # Access Hugging Face datasets
     "seqeval>=1.2,<2", # Sequence labeling evaluation
diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py
index b01d58f..cd91cc4 100644
--- a/src/mozilla_sec_eia/models/sec10k/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/__init__.py
@@ -9,6 +9,7 @@
     load_assets_from_modules,
     load_assets_from_package_module,
 )
+from dagster_gcp.gcs import GCSPickleIOManager, GCSResource
 from dagstermill import (
     ConfigurableLocalOutputNotebookIOManager,
     define_dagstermill_asset,
@@ -112,6 +113,11 @@
             mlflow_interface=mlflow_interface_resource
         ),
         "output_notebook_io_manager": ConfigurableLocalOutputNotebookIOManager(),
+        "io_manager": GCSPickleIOManager(
+            gcs_bucket="sec10k-outputs",
+            gcs_prefix="dagster_storage",
+            gcs=GCSResource(project="catalyst-cooperative-mozilla"),
+        ),
     }
     | mlflow_train_test_io_managers,
 )

From d6c41a2891c9cd56cdb610c5c7e70a38d4ab906a Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Fri, 11 Oct 2024 11:26:13 -0400
Subject: [PATCH 110/161] Switch gcs pickle io manager to upath based

---
 .../library/generic_io_managers.py             | 18 ++++++++++++++++++
 src/mozilla_sec_eia/models/sec10k/__init__.py  | 14 +++++++-------
 .../models/sec10k/ex_21/__init__.py            |  8 +++++++-
 3 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/src/mozilla_sec_eia/library/generic_io_managers.py b/src/mozilla_sec_eia/library/generic_io_managers.py
index e85aa68..7d25198 100644
--- a/src/mozilla_sec_eia/library/generic_io_managers.py
+++ b/src/mozilla_sec_eia/library/generic_io_managers.py
@@ -1,5 +1,7 @@
 """Implement useful generic io-managers."""
 
+import pickle
+
 import pandas as pd
 from dagster import InputContext, OutputContext, UPathIOManager
 from upath import UPath
@@ -19,3 +21,19 @@ def load_from_path(self, context: InputContext, path: UPath) -> pd.DataFrame:
         """Read parquet."""
         with path.open("rb") as file:
             return pd.read_parquet(file)
+
+
+class PickleUPathIOManager(UPathIOManager):
+    """Read and write arbitrary Python objects as pickle files on local or remote filesystem."""
+
+    extension: str = ".pickle"
+
+    def dump_to_path(self, context: OutputContext, obj: pd.DataFrame, path: UPath):
+        """Write pickle."""
+        with path.open("wb") as file:
+            pickle.dump(obj, file)
+
+    def load_from_path(self, context: InputContext, path: UPath) -> pd.DataFrame:
+        """Read pickle."""
+        with path.open("rb") as file:
+            return pickle.load(file)  # noqa: S301
diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py
index cd91cc4..6a33b78 100644
--- a/src/mozilla_sec_eia/models/sec10k/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/__init__.py
@@ -9,7 +9,6 @@
     load_assets_from_modules,
     load_assets_from_package_module,
 )
-from dagster_gcp.gcs import GCSPickleIOManager, GCSResource
 from dagstermill import (
     ConfigurableLocalOutputNotebookIOManager,
     define_dagstermill_asset,
@@ -17,7 +16,10 @@
 from upath import UPath
 
 from mozilla_sec_eia.library import model_jobs
-from mozilla_sec_eia.library.generic_io_managers import PandasParquetIOManager
+from mozilla_sec_eia.library.generic_io_managers import (
+    PandasParquetIOManager,
+    PickleUPathIOManager,
+)
 from mozilla_sec_eia.library.mlflow import (
     MlflowPyfuncModelIOManager,
     mlflow_interface_resource,
@@ -109,15 +111,13 @@
         "pandas_parquet_io_manager": PandasParquetIOManager(
             base_path=UPath("gs://sec10k-outputs/v2")
         ),
+        "pickle_gcs_io_manager": PickleUPathIOManager(
+            base_path=UPath("gs://sec10k-outputs/dagster_storage")
+        ),
         "pyfunc_model_io_manager": MlflowPyfuncModelIOManager(
             mlflow_interface=mlflow_interface_resource
         ),
         "output_notebook_io_manager": ConfigurableLocalOutputNotebookIOManager(),
-        "io_manager": GCSPickleIOManager(
-            gcs_bucket="sec10k-outputs",
-            gcs_prefix="dagster_storage",
-            gcs=GCSResource(project="catalyst-cooperative-mozilla"),
-        ),
     }
     | mlflow_train_test_io_managers,
 )
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
index 2e6a13b..3558cd9 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
@@ -121,7 +121,13 @@ def collect_extracted_chunks(
     )
 
 
-@op
+@op(
+    out={
+        "dataset": Out(
+            io_manager_key="pickle_gcs_io_manager",
+        ),
+    }
+)
 def create_dataset(
     cloud_interface: GCSArchive, filings: pd.DataFrame
 ) -> tuple[pd.DataFrame, pd.DataFrame]:

From ddd22639bbd6f4aebe8e602e31e508c1cfd83e9e Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Fri, 11 Oct 2024 12:05:31 -0400
Subject: [PATCH 111/161] Remove duplicate logger

---
 pyproject.toml                                            | 1 -
 src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py | 3 ---
 2 files changed, 4 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 9e2e66e..8a66d85 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,7 +24,6 @@ dependencies = [
     "dagster-mlflow",
     "dagster-webserver",
     "dagster-pandera",
-    "dagster-gcp",
     "dagstermill",
     "datasets>=2.1,<3", # Access Hugging Face datasets
     "seqeval>=1.2,<2", # Sequence labeling evaluation
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py
index 56a7d9b..25db2d9 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py
@@ -14,9 +14,6 @@
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
 
-logger = logging.getLogger(f"catalystcoop.{__name__}")
-
-
 def format_unlabeled_pdf_dataframe(pdfs_dir: Path):
     """Read and format PDFs into a dataframe (without labels)."""
     inference_df = pd.DataFrame()

From 93bffcbea41ee945b8993b86c55275a8d5d5367f Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Fri, 11 Oct 2024 12:30:09 -0400
Subject: [PATCH 112/161] Fix config warnings

---
 pyproject.toml | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 8a66d85..1398915 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -157,7 +157,7 @@ doctest_optionflags = [
 
 [tool.ruff]
 exclude = ["notebooks/*"]
-select = [
+lint.select = [
     "A", # flake8-builtins
     # "ARG", # unused arguments
     # "B",  # flake8-bugbear
@@ -185,7 +185,7 @@ select = [
     "UP", # pyupgrade (use modern python syntax)
     "W",  # pycodestyle warnings
 ]
-ignore = [
+lint.ignore = [
     "D401",   # Require imperative mood in docstrings.
     "D417",
     "E501",   # Overlong lines.
@@ -205,26 +205,26 @@ target-version = "py311"
 line-length = 88
 
 # Don't automatically concatenate strings -- sometimes we forget a comma!
-unfixable = ["ISC"]
+lint.unfixable = ["ISC"]
 
-[tool.ruff.per-file-ignores]
+[tool.ruff.lint.per-file-ignores]
 "__init__.py" = ["F401"]  # Ignore unused imports
 "tests/*" = ["D"]
 
-[tool.ruff.pep8-naming]
+[tool.ruff.lint.pep8-naming]
 # Allow Pydantic's `@validator` decorator to trigger class method treatment.
 classmethod-decorators = ["pydantic.validator", "pydantic.root_validator"]
 
-[tool.ruff.isort]
+[tool.ruff.lint.isort]
 known-first-party = ["pudl"]
 
-[tool.ruff.pydocstyle]
+[tool.ruff.lint.pydocstyle]
 convention = "google"
 
-[tool.ruff.mccabe]
+[tool.ruff.lint.mccabe]
 max-complexity = 10
 
-[tool.ruff.flake8-quotes]
+[tool.ruff.lint.flake8-quotes]
 docstring-quotes = "double"
 inline-quotes = "double"
 multiline-quotes = "double"

From d717caa72d821a846a961f4c3017e25c8bdb72cf Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Fri, 11 Oct 2024 12:46:20 -0400
Subject: [PATCH 113/161] Test pin sphinx

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 1398915..e38d6b8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -92,7 +92,7 @@ dev = [
 docs = [
     "doc8>=1,<2",  # Ensures clean documentation formatting
     "furo>=2022.4.7",
-    "sphinx>=6,<9",  # The default Python documentation engine
+    "sphinx>=6,<8.1",  # The default Python documentation engine
     "sphinx-autoapi>=2,<4",  # Generates documentation from docstrings
     "sphinx-issues>=1.2,<5",  # Allows references to GitHub issues
 

From 09cd18974b69c3eb58e3ca0c7ab8b468fca48764 Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Sun, 13 Oct 2024 12:59:23 -0700
Subject: [PATCH 114/161] add splink and model to environment

---
 environment.yml | 4 +++-
 pyproject.toml  | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/environment.yml b/environment.yml
index 4a985d5..73986fe 100644
--- a/environment.yml
+++ b/environment.yml
@@ -19,11 +19,13 @@ dependencies:
 
   # Jupyter packages:
   - jupyterlab>=3.2,<4
-  - nbconvert>=6,<7 # Used to clear notebook outputs in pre-commit hooks
+  - nbconvert>=7 # Used to clear notebook outputs in pre-commit hooks
 
   # These are not normal Python packages available on PyPI
   - nodejs # Useful for Jupyter and prettier pre-commit hook
 
+  - catalystcoop.pudl>=2023.2.5,<=2024.8.0
+
   # Use pip to install the package defined by this repo for development:
   - pip:
       - --editable ./[dev,docs,tests,types]
diff --git a/pyproject.toml b/pyproject.toml
index e38d6b8..c38ae7e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -44,6 +44,7 @@ dependencies = [
     "pydantic-settings>=2",
     "python-bidi<0.7.0",
     "pymupdf",  # Convert PDF to image
+    "splink>=4,<5",
     "sqlalchemy>=2,<3",
     "timm>0.9,<2", # dependency for Hugging Face computer vision models
     "torch>=2.2,<3",

From 15be127ab15e67db68c50bd9cbb8a8f2272a7382 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Mon, 14 Oct 2024 11:37:05 -0400
Subject: [PATCH 115/161] Catch errors while normalizing bounding boxes

---
 .../models/sec10k/ex_21/data/inference.py         | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py
index 25db2d9..c84c6db 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py
@@ -7,6 +7,7 @@
 
 import pandas as pd
 
+from ...entities import Sec10kExtractionMetadata
 from ...utils.cloud import GCSArchive
 from ...utils.pdf import get_image_dict, get_pdf_data_from_path
 from .common import BBOX_COLS_PDF, format_label_studio_output, normalize_bboxes
@@ -17,6 +18,7 @@
 def format_unlabeled_pdf_dataframe(pdfs_dir: Path):
     """Read and format PDFs into a dataframe (without labels)."""
     inference_df = pd.DataFrame()
+    failed_format_metadata = Sec10kExtractionMetadata.example(0)
     for pdf_filename in os.listdir(pdfs_dir):
         if not pdf_filename.endswith(".pdf"):
             continue
@@ -26,9 +28,16 @@ def format_unlabeled_pdf_dataframe(pdfs_dir: Path):
         txt = extracted["pdf_text"]
         pg_meta = extracted["page"]
         # normalize bboxes between 0 and 1000 for Hugging Face
-        txt = normalize_bboxes(txt_df=txt, pg_meta_df=pg_meta)
-        txt.loc[:, "id"] = filename
-        inference_df = pd.concat([inference_df, txt])
+        try:
+            txt = normalize_bboxes(txt_df=txt, pg_meta_df=pg_meta)
+            txt.loc[:, "id"] = filename
+            inference_df = pd.concat([inference_df, txt])
+        except KeyError:
+            logger.warning(f"Failed to normalize bounding boxes for filing: {filename}")
+            failed_format_metadata.loc[filename, ["success", "notes"]] = [
+                False,
+                "Failed to normalize bounding boxes",
+            ]
     return inference_df
 
 

From 4117d0ae161811d03e800e6a0098b707b43702cc Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Mon, 14 Oct 2024 11:40:01 -0400
Subject: [PATCH 116/161] Fix call to pandera example

---
 src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py
index c84c6db..def88fd 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py
@@ -18,7 +18,7 @@
 def format_unlabeled_pdf_dataframe(pdfs_dir: Path):
     """Read and format PDFs into a dataframe (without labels)."""
     inference_df = pd.DataFrame()
-    failed_format_metadata = Sec10kExtractionMetadata.example(0)
+    failed_format_metadata = Sec10kExtractionMetadata.example(size=0)
     for pdf_filename in os.listdir(pdfs_dir):
         if not pdf_filename.endswith(".pdf"):
             continue

From 8c8dd602f586e85cd44c6ddb9f998f32879195e0 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Mon, 14 Oct 2024 11:52:10 -0400
Subject: [PATCH 117/161] Fix handle failures in converting to pdf

---
 .../models/sec10k/ex_21/data/inference.py          | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py
index def88fd..e2498d8 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py
@@ -38,7 +38,7 @@ def format_unlabeled_pdf_dataframe(pdfs_dir: Path):
                 False,
                 "Failed to normalize bounding boxes",
             ]
-    return inference_df
+    return inference_df, failed_format_metadata
 
 
 def _cache_pdfs(
@@ -100,8 +100,18 @@ def create_inference_dataset(
                 labeled_json_dir=labeled_json_dir, pdfs_dir=pdfs_dir
             )
         else:
-            inference_df = format_unlabeled_pdf_dataframe(pdfs_dir=pdfs_dir)
+            inference_df, failed_format_metadata = format_unlabeled_pdf_dataframe(
+                pdfs_dir=pdfs_dir
+            )
+            extraction_metadata = pd.concat(
+                [extraction_metadata, failed_format_metadata]
+            )
         image_dict = get_image_dict(pdfs_dir)
+        image_dict = {
+            filename: image
+            for filename, image in image_dict.items()
+            if filename not in extraction_metadata
+        }
 
     annotations = []
     for filename, image in image_dict.items():

From ff821b56b964338f3152d3971a67c19f0eb2ade6 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Mon, 14 Oct 2024 11:58:08 -0400
Subject: [PATCH 118/161] Actually fix handle failures in converting to pdf

---
 src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py
index e2498d8..8e6b661 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py
@@ -110,7 +110,7 @@ def create_inference_dataset(
         image_dict = {
             filename: image
             for filename, image in image_dict.items()
-            if filename not in extraction_metadata
+            if filename not in extraction_metadata.index
         }
 
     annotations = []

From a8eb359b0f156c8344c2a9896db2e125ecb6864b Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 16 Oct 2024 12:55:50 -0400
Subject: [PATCH 119/161] Add model documentation to sec10k readme

---
 src/mozilla_sec_eia/models/sec10k/README.rst | 56 ++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/src/mozilla_sec_eia/models/sec10k/README.rst b/src/mozilla_sec_eia/models/sec10k/README.rst
index ffecf28..c5f94f4 100644
--- a/src/mozilla_sec_eia/models/sec10k/README.rst
+++ b/src/mozilla_sec_eia/models/sec10k/README.rst
@@ -3,6 +3,62 @@ sec10k: Extracting company ownership data from sec10k documents
 
 This repo contains exploratory development for an SEC-EIA linkage.
 
+Models
+------
+Basic 10k
+^^^^^^^^^
+The extraction model for basic 10k company information is very simple and requires no
+training. This model is implemented as a simple rule-based parser that finds key-value
+pairs containing company information, which is embedded in a header for all 10k filings.
+
+Exhibit 21
+^^^^^^^^^^
+Exhibit 21 extraction is much more complicated and requires pretrained models that are
+cached with our mlflow tracking server. Currently, there are 2 models which are
+implemented in the ``notebooks/`` directory. These notebooks use
+`Dagstermill <https://docs.dagster.io/integrations/dagstermill/using-notebooks-with-dagster>`_
+so they can be run interactively like any normal Jupyter Notebook, or run in a Dagster
+job.
+
+Extraction
+""""""""""
+The primary extraction model is implemented in ``notebooks/exhibit21_extractor.ipynb``.
+This model is based on
+`layoutlm <https://huggingface.co/microsoft/layoutlmv3-base>`_ with custom inference logic
+to construct a table of ownership information from an exhibit 21 document. Both the
+layoutlm model and the inference model are logged separately with mlflow. This
+separation between the models allows for testing minor modifications to the inference
+portion with the same pretrained layoutlm model.
+
+There are currently two configuration parameters that are used by the extraction model
+notebook:
+
+* ``layoutlm_training_run``: This should be an existing mlflow run name, which was used
+  to train layoutlm, and has a logged model associated with it. If ``None`` layoutlm
+  will be trained when the notebook is run, and the new training run will be used for
+  inference and validation.
+* ``training_data_version``: This should point to a GCS folder containing training
+  data to use with layoutlm. If ``layoutlm_training_run`` is set, then this parameter
+  doesn't matter, as layoutlm will not be re-trained when the notebook is executed.
+
+The notebook also depends on several upstream dagster assets, which produce training and
+validation datasets. Using upstream assets allows these datasets, which are relatively
+expensive to produce, to be easily cached and reused while iterating on the model.
+These upstream assets need to be produced before the notebook can be run. They should
+also be re-materialized if you want to modify the training or validation data, otherwise
+the notebook can be re-run as many times as desired with existing data.
+
+Layout Classification
+"""""""""""""""""""""
+The second model is a classifier, which labels filings as either having a 'paragraph'
+layout or not. This is done because the extraction model performs poorly on documents
+formatted as paragraphs rather than tables. For now we will likely just filter out these
+results, but we could also develop a separate extraction model which handles these
+documents better.
+
+This model also depends on upstream assets to produce training data, which will need
+to be produced before running the notebook.
+
 Usage
 -----
 

From dc160ac21641fcd8e74e654b1d1ab97abb6f8d4f Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 16 Oct 2024 17:16:33 -0400
Subject: [PATCH 120/161] Fix ex 21 validation integration test

---
 .../sec10k/ex_21/ex21_validation_helpers.py   |  84 +++++++++
 .../notebooks/exhibit21_extractor.ipynb       | 165 +++---------------
 .../integration/models/sec10k/extract_test.py |  37 ++--
 3 files changed, 136 insertions(+), 150 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/ex21_validation_helpers.py b/src/mozilla_sec_eia/models/sec10k/ex_21/ex21_validation_helpers.py
index fca7168..0b530a9 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/ex21_validation_helpers.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/ex21_validation_helpers.py
@@ -3,9 +3,93 @@
 import numpy as np
 import pandas as pd
 
+from mozilla_sec_eia.library import validation_helpers
 from mozilla_sec_eia.models.sec10k.utils.cloud import get_metadata_filename
 
 
+def ex21_validation_metrics(computed_df: pd.DataFrame, validation_df: pd.DataFrame):
+    """Compute validation metrics for Ex. 21 extraction."""
+    shared_cols = validation_df.columns.intersection(computed_df.columns)
+    validation_df = validation_df.astype(computed_df[shared_cols].dtypes)
+    # strip llc and other company name parts for the similarity comparison
+    computed_df["subsidiary"] = validation_helpers.strip_down_company_names(
+        computed_df["subsidiary"]
+    )
+    validation_df["subsidiary"] = validation_helpers.strip_down_company_names(
+        validation_df["subsidiary"]
+    )
+    n_equal = 0
+    validation_filenames = validation_df["id"].unique()
+    n_files = len(validation_filenames)
+    table_metrics_dict = {}
+    jaccard_dict = {}
+    incorrect_files = []
+    # iterate through each file and check each extracted table
+    for filename in validation_filenames:
+        extracted_table_df = computed_df[computed_df["id"] == filename].reset_index(
+            drop=True
+        )
+        validation_table_df = validation_df[
+            validation_df["id"] == filename
+        ].reset_index(drop=True)
+        # check if the tables are exactly equal
+        if extracted_table_df[["subsidiary", "loc", "own_per"]].equals(
+            validation_table_df[["subsidiary", "loc", "own_per"]]
+        ):
+            n_equal += 1
+        else:
+            incorrect_files.append(filename)
+        # compute jaccard sim + precision and recall for each column
+        table_metrics_dict[filename] = {}
+        jaccard_dict[filename] = {}
+        for col in ["subsidiary", "loc", "own_per"]:
+            extracted_table_df[col] = validation_helpers.fill_nulls_for_comparison(
+                extracted_table_df[col]
+            )
+            validation_table_df[col] = validation_helpers.fill_nulls_for_comparison(
+                validation_table_df[col]
+            )
+            table_prec_recall = validation_helpers.pandas_compute_precision_recall(
+                extracted_table_df, validation_table_df, value_col=col
+            )
+            table_metrics_dict[filename][f"{col}_precision"] = table_prec_recall[
+                "precision"
+            ]
+            table_metrics_dict[filename][f"{col}_recall"] = table_prec_recall["recall"]
+            # get the jaccard similarity between columns
+            jaccard_dict[filename][col] = validation_helpers.jaccard_similarity(
+                computed_df=extracted_table_df,
+                validation_df=validation_table_df,
+                value_col=col,
+            )
+
+    jaccard_df = pd.DataFrame.from_dict(jaccard_dict, orient="index").reset_index()
+    prec_recall_df = pd.DataFrame.from_dict(
+        table_metrics_dict, orient="index"
+    ).reset_index()
+
+    return (
+        jaccard_df,
+        prec_recall_df,
+        pd.DataFrame({"filename": incorrect_files}),
+        {
+            "table_accuracy": n_equal / n_files,
+            "avg_subsidiary_jaccard_sim": jaccard_df["subsidiary"].sum() / n_files,
+            "avg_location_jaccard_sim": jaccard_df["loc"].sum() / n_files,
+            "avg_own_per_jaccard_sim": jaccard_df["own_per"].sum() / n_files,
+            "avg_subsidiary_precision": prec_recall_df["subsidiary_precision"].sum()
+            / n_files,
+            "avg_location_precision": prec_recall_df["loc_precision"].sum() / n_files,
+            "avg_own_per_precision": prec_recall_df["own_per_precision"].sum()
+            / n_files,
+            "avg_subsidiary_recall": prec_recall_df["subsidiary_recall"].sum()
+            / n_files,
+            "avg_location_recall": prec_recall_df["loc_recall"].sum() / n_files,
+            "avg_own_per_recall": prec_recall_df["own_per_recall"].sum() / n_files,
+        },
+    )
+
+
 def clean_extracted_df(extracted_df):
     """Perform basic cleaning on a dataframe extracted from an Ex. 21."""
     if extracted_df.empty:
diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
index d136a25..53e16c8 100644
--- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
+++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
@@ -51,13 +51,13 @@
      "output_type": "stream",
      "text": [
       "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n",
-      "2024-10-09 15:25:02 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_training_data using PickledObjectFilesystemIOManager...\n",
+      "2024-10-16 17:11:06 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_training_data using PickledObjectFilesystemIOManager...\n",
       "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n",
-      "2024-10-09 15:25:03 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_failed_parsing_metadata using PickledObjectFilesystemIOManager...\n",
+      "2024-10-16 17:11:12 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_failed_parsing_metadata using PickledObjectFilesystemIOManager...\n",
       "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n",
-      "2024-10-09 15:25:03 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_inference_dataset using PickledObjectFilesystemIOManager...\n",
+      "2024-10-16 17:11:12 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_inference_dataset using PickledObjectFilesystemIOManager...\n",
       "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n",
-      "2024-10-09 15:25:04 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_validation_set using PickledObjectFilesystemIOManager...\n"
+      "2024-10-16 17:11:15 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_validation_set using PickledObjectFilesystemIOManager...\n"
      ]
     }
    ],
@@ -113,7 +113,6 @@
     "import numpy as np\n",
     "import pandas as pd\n",
     "\n",
-    "from mozilla_sec_eia.library import validation_helpers\n",
     "from mozilla_sec_eia.models.sec10k.utils.cloud import get_metadata_filename\n",
     "\n",
     "\n",
@@ -326,7 +325,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 4,
    "id": "42c8e920-d671-40c2-b5db-c43611a33897",
    "metadata": {
     "tags": []
@@ -542,7 +541,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 5,
    "id": "4d802e00-1ca4-40b3-b15b-561711a9db70",
    "metadata": {
     "tags": []
@@ -551,7 +550,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "3145df6c447a4f958ac86b7a84c9f52d",
+       "model_id": "d0779d02915a4503b0cd92d3df38cf88",
        "version_major": 2,
        "version_minor": 0
       },
@@ -566,13 +565,13 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2024/10/09 15:26:25 INFO mlflow.types.utils: Unsupported type hint: <class 'pandas.core.frame.DataFrame'>, skipping schema inference\n"
+      "2024/10/16 17:11:20 INFO mlflow.types.utils: Unsupported type hint: <class 'pandas.core.frame.DataFrame'>, skipping schema inference\n"
      ]
     },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "bae0a0244e4141449874b48f750bd443",
+       "model_id": "601bb4ae91dd4a218fe5be047f4829d0",
        "version_major": 2,
        "version_minor": 0
       },
@@ -587,15 +586,15 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2024/10/09 15:26:54 WARNING mlflow.utils.requirements_utils: The following packages were not found in the public PyPI package index as of 2024-08-29; if these packages are not present in the public PyPI index, you must install them manually before loading your model: {'catalystcoop-mozilla-sec-eia'}\n",
-      "2024/10/09 15:26:54 WARNING mlflow.utils.requirements_utils: Found catalystcoop-mozilla-sec-eia version (0.1.dev298+g6f9d34a.d20240923) contains a local version label (+g6f9d34a.d20240923). MLflow logged a pip requirement for this package as 'catalystcoop-mozilla-sec-eia==0.1.dev298' without the local version label to make it installable from PyPI. To specify pip requirements containing local version labels, please use `conda_env` or `pip_requirements`.\n",
-      "2024/10/09 15:26:54 WARNING mlflow.transformers.model_io: Could not specify device parameter for this pipeline type.Falling back to loading the model with the default device.\n"
+      "2024/10/16 17:11:51 WARNING mlflow.utils.requirements_utils: The following packages were not found in the public PyPI package index as of 2024-08-29; if these packages are not present in the public PyPI index, you must install them manually before loading your model: {'catalystcoop-mozilla-sec-eia'}\n",
+      "2024/10/16 17:11:51 WARNING mlflow.utils.requirements_utils: Found catalystcoop-mozilla-sec-eia version (0.1.dev353+gdf5fe0d.d20241011) contains a local version label (+gdf5fe0d.d20241011). MLflow logged a pip requirement for this package as 'catalystcoop-mozilla-sec-eia==0.1.dev353' without the local version label to make it installable from PyPI. To specify pip requirements containing local version labels, please use `conda_env` or `pip_requirements`.\n",
+      "2024/10/16 17:11:51 WARNING mlflow.transformers.model_io: Could not specify device parameter for this pipeline type.Falling back to loading the model with the default device.\n"
      ]
     },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "a0efe85e59d7401092b6bc7eed6d0bb5",
+       "model_id": "68b8d5cef3a94294b243b6f0c3e8ee5f",
        "version_major": 2,
        "version_minor": 0
       },
@@ -710,120 +709,10 @@
   },
   {
    "cell_type": "markdown",
-   "id": "0bd74bdc-bb63-4ad2-82ec-3dfcf93a6121",
-   "metadata": {},
-   "source": [
-    "#### Load validation data\n",
-    "Next, load an inference dataset containing validation data. This dataset is formatted exactly the same as those that will feed into the `Ex21Extractor` during a production run, but contain only data from the validation set. When creating inference datasets we also produce a metadata dataframe documenting any filings that couldn't be parsed/converted to a PDF. This dataframe should be empty for the validation set, but we will still load it for consistency with production runs."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "eddcc912-324a-42e9-9841-3a916c6ece6b",
-   "metadata": {},
-   "source": [
-    "Next define method method for computing validation metrics. The metrics computed above for training are looking at bounding boxes output by `layoutlm` and pertain to one word at a time. These metrics will look at an entire table produced the inference pipeline and compare to the validation data. "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "id": "f79bd14d-5156-4f34-9a50-e9c813b822cf",
+   "id": "1dee550f-7b06-4091-a65e-71c6b23a5bea",
    "metadata": {
     "tags": []
    },
-   "outputs": [],
-   "source": [
-    "from mlflow.models import infer_signature\n",
-    "\n",
-    "\n",
-    "def ex21_validation_metrics(computed_df: pd.DataFrame, validation_df: pd.DataFrame):\n",
-    "    \"\"\"Compute validation metrics for Ex. 21 extraction.\"\"\"\n",
-    "    shared_cols = validation_df.columns.intersection(computed_df.columns)\n",
-    "    validation_df = validation_df.astype(computed_df[shared_cols].dtypes)\n",
-    "    # strip llc and other company name parts for the similarity comparison\n",
-    "    computed_df[\"subsidiary\"] = validation_helpers.strip_down_company_names(\n",
-    "        computed_df[\"subsidiary\"]\n",
-    "    )\n",
-    "    validation_df[\"subsidiary\"] = validation_helpers.strip_down_company_names(\n",
-    "        validation_df[\"subsidiary\"]\n",
-    "    )\n",
-    "    n_equal = 0\n",
-    "    validation_filenames = validation_df[\"id\"].unique()\n",
-    "    n_files = len(validation_filenames)\n",
-    "    table_metrics_dict = {}\n",
-    "    jaccard_dict = {}\n",
-    "    incorrect_files = []\n",
-    "    # iterate through each file and check each extracted table\n",
-    "    for filename in validation_filenames:\n",
-    "        extracted_table_df = computed_df[computed_df[\"id\"] == filename].reset_index(\n",
-    "            drop=True\n",
-    "        )\n",
-    "        validation_table_df = validation_df[\n",
-    "            validation_df[\"id\"] == filename\n",
-    "        ].reset_index(drop=True)\n",
-    "        # check if the tables are exactly equal\n",
-    "        if extracted_table_df[[\"subsidiary\", \"loc\", \"own_per\"]].equals(\n",
-    "            validation_table_df[[\"subsidiary\", \"loc\", \"own_per\"]]\n",
-    "        ):\n",
-    "            n_equal += 1\n",
-    "        else:\n",
-    "            incorrect_files.append(filename)\n",
-    "        # compute jaccard sim + precision and recall for each column\n",
-    "        table_metrics_dict[filename] = {}\n",
-    "        jaccard_dict[filename] = {}\n",
-    "        for col in [\"subsidiary\", \"loc\", \"own_per\"]:\n",
-    "            extracted_table_df[col] = validation_helpers.fill_nulls_for_comparison(\n",
-    "                extracted_table_df[col]\n",
-    "            )\n",
-    "            validation_table_df[col] = validation_helpers.fill_nulls_for_comparison(\n",
-    "                validation_table_df[col]\n",
-    "            )\n",
-    "            table_prec_recall = validation_helpers.pandas_compute_precision_recall(\n",
-    "                extracted_table_df, validation_table_df, value_col=col\n",
-    "            )\n",
-    "            table_metrics_dict[filename][f\"{col}_precision\"] = table_prec_recall[\n",
-    "                \"precision\"\n",
-    "            ]\n",
-    "            table_metrics_dict[filename][f\"{col}_recall\"] = table_prec_recall[\"recall\"]\n",
-    "            # get the jaccard similarity between columns\n",
-    "            jaccard_dict[filename][col] = validation_helpers.jaccard_similarity(\n",
-    "                computed_df=extracted_table_df,\n",
-    "                validation_df=validation_table_df,\n",
-    "                value_col=col,\n",
-    "            )\n",
-    "\n",
-    "    jaccard_df = pd.DataFrame.from_dict(jaccard_dict, orient=\"index\").reset_index()\n",
-    "    prec_recall_df = pd.DataFrame.from_dict(\n",
-    "        table_metrics_dict, orient=\"index\"\n",
-    "    ).reset_index()\n",
-    "\n",
-    "    return (\n",
-    "        jaccard_df,\n",
-    "        prec_recall_df,\n",
-    "        pd.DataFrame({\"filename\": incorrect_files}),\n",
-    "        {\n",
-    "            \"table_accuracy\": n_equal / n_files,\n",
-    "            \"avg_subsidiary_jaccard_sim\": jaccard_df[\"subsidiary\"].sum() / n_files,\n",
-    "            \"avg_location_jaccard_sim\": jaccard_df[\"loc\"].sum() / n_files,\n",
-    "            \"avg_own_per_jaccard_sim\": jaccard_df[\"own_per\"].sum() / n_files,\n",
-    "            \"avg_subsidiary_precision\": prec_recall_df[\"subsidiary_precision\"].sum()\n",
-    "            / n_files,\n",
-    "            \"avg_location_precision\": prec_recall_df[\"loc_precision\"].sum() / n_files,\n",
-    "            \"avg_own_per_precision\": prec_recall_df[\"own_per_precision\"].sum()\n",
-    "            / n_files,\n",
-    "            \"avg_subsidiary_recall\": prec_recall_df[\"subsidiary_recall\"].sum()\n",
-    "            / n_files,\n",
-    "            \"avg_location_recall\": prec_recall_df[\"loc_recall\"].sum() / n_files,\n",
-    "            \"avg_own_per_recall\": prec_recall_df[\"own_per_recall\"].sum() / n_files,\n",
-    "        },\n",
-    "    )\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "1dee550f-7b06-4091-a65e-71c6b23a5bea",
-   "metadata": {},
    "source": [
     "#### Validate model\n",
     "Finally, run the full model on the validation set and log metrics to mlflow. The logged metrics/model will appear in a nested run below the training run used for the current version of the model."
@@ -831,15 +720,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
    "id": "dfb56470-8527-424c-a9e5-4135e55fde4d",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2024/10/09 15:26:56 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.\n",
+      "2024/10/16 17:11:53 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.\n",
       "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
       "  warnings.warn(\n",
       "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
@@ -960,7 +851,7 @@
       "  warnings.warn(\n",
       "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
       "  warnings.warn(\n",
-      "/tmp/ipykernel_168606/2514174394.py:29: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
+      "/tmp/ipykernel_48762/2514174394.py:29: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
       "  lambda group: group.ffill()\n",
       "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n",
       "  padded_compute_set = pd.concat(\n",
@@ -1177,7 +1068,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "21da151ecd6d4a9187bf77b40c7a8aed",
+       "model_id": "db36592620c244479123275dfc464648",
        "version_major": 2,
        "version_minor": 0
       },
@@ -1192,16 +1083,18 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2024/10/09 15:28:01 WARNING mlflow.utils.requirements_utils: The following packages were not found in the public PyPI package index as of 2024-08-29; if these packages are not present in the public PyPI index, you must install them manually before loading your model: {'catalystcoop-mozilla-sec-eia'}\n",
-      "2024/10/09 15:28:01 WARNING mlflow.utils.requirements_utils: Found catalystcoop-mozilla-sec-eia version (0.1.dev298+g6f9d34a.d20240923) contains a local version label (+g6f9d34a.d20240923). MLflow logged a pip requirement for this package as 'catalystcoop-mozilla-sec-eia==0.1.dev298' without the local version label to make it installable from PyPI. To specify pip requirements containing local version labels, please use `conda_env` or `pip_requirements`.\n",
-      "2024/10/09 15:35:17 INFO mlflow.tracking._tracking_service.client: 🏃 View run unleashed-snake-419 at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/13/runs/426dd1b67cbd4677b6fa22b6b9d9173a.\n",
-      "2024/10/09 15:35:17 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/13.\n",
-      "2024/10/09 15:35:17 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...\n",
-      "2024/10/09 15:35:17 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!\n"
+      "2024/10/16 17:12:58 WARNING mlflow.utils.requirements_utils: The following packages were not found in the public PyPI package index as of 2024-08-29; if these packages are not present in the public PyPI index, you must install them manually before loading your model: {'catalystcoop-mozilla-sec-eia'}\n",
+      "2024/10/16 17:12:58 WARNING mlflow.utils.requirements_utils: Found catalystcoop-mozilla-sec-eia version (0.1.dev353+gdf5fe0d.d20241011) contains a local version label (+gdf5fe0d.d20241011). MLflow logged a pip requirement for this package as 'catalystcoop-mozilla-sec-eia==0.1.dev353' without the local version label to make it installable from PyPI. To specify pip requirements containing local version labels, please use `conda_env` or `pip_requirements`.\n"
      ]
     }
    ],
    "source": [
+    "from mlflow.models import infer_signature\n",
+    "\n",
+    "from mozilla_sec_eia.models.sec10k.ex_21.ex21_validation_helpers import (\n",
+    "    ex21_validation_metrics,\n",
+    ")\n",
+    "\n",
     "with mlflow.start_run(parent_run_id=model_info.run_id, nested=True):\n",
     "    metadata, extracted = ex21_extraction_model.predict(ex21_inference_dataset.copy())\n",
     "    metadata = pd.concat([ex21_failed_parsing_metadata, metadata])\n",
diff --git a/tests/integration/models/sec10k/extract_test.py b/tests/integration/models/sec10k/extract_test.py
index 87636f1..38a795a 100644
--- a/tests/integration/models/sec10k/extract_test.py
+++ b/tests/integration/models/sec10k/extract_test.py
@@ -2,9 +2,10 @@
 
 import logging
 import os
-import unittest
 
 import dotenv
+import mlflow
+from dagster import materialize_to_memory
 
 from mozilla_sec_eia.library.mlflow import configure_mlflow
 from mozilla_sec_eia.library.mlflow.mlflow_resource import get_most_recent_run
@@ -41,20 +42,28 @@ def test_ex21_validation(
         os.getenv("MLFLOW_TRACKING_URI"),
         os.getenv("GCS_PROJECT"),
     )
-    pretrained_model = sec10k.utils.layoutlm._load_pretrained_layoutlm(
-        cache_path=tmp_path
+
+    # Load validation data
+    result = materialize_to_memory(
+        [
+            sec10k.ex_21.data.ex21_validation_set,
+            sec10k.ex_21.data.ex21_validation_filing_metadata,
+            sec10k.ex_21.data.ex21_inference_dataset,
+        ],
+        resources={"cloud_interface": sec10k.utils.GCSArchive()},
+    )
+    ex21_inference_dataset = result.output_for_node(
+        "ex21_inference_dataset", output_name="ex21_inference_dataset"
     )
+    ex21_validation_set = result.output_for_node("ex21_validation_set")
 
-    with unittest.mock.patch(
-        "mozilla_sec_eia.models.sec10k.utils.layoutlm._load_pretrained_layoutlm",
-        new=lambda cache_path, version: pretrained_model,
-    ):
-        set_test_mlflow_env_vars_factory()
-        result = sec10k.defs.get_job_def(
-            "ex21_extraction_validation"
-        ).execute_in_process()
+    # Load latest version of pretrained model
+    pretrained_model = mlflow.pyfunc.load_model("models:/exhibit21_extractor/latest")
+    _, extracted = pretrained_model.predict(ex21_inference_dataset)
 
-    run = get_most_recent_run("ex21_extraction_validation", result.run_id)
+    _, _, _, metrics = sec10k.ex_21.ex21_validation_helpers.ex21_validation_metrics(
+        extracted, ex21_validation_set
+    )
 
-    assert run.data.metrics["avg_subsidiary_jaccard_sim"] > 0.85
-    assert run.data.metrics["avg_location_jaccard_sim"] > 0.83
+    assert metrics["avg_subsidiary_jaccard_sim"] > 0.85
+    assert metrics["avg_location_jaccard_sim"] > 0.83

From 10b24a92cc092e64c4c8631b357d34ec49565680 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 16 Oct 2024 18:33:07 -0400
Subject: [PATCH 121/161] Improve classifier error handling

---
 src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
index 3558cd9..367c2e2 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
@@ -76,7 +76,7 @@ def classify_chunk_layouts(
                 "paragraph": exhibit21_layout_classifier.predict(inference_dataset),
             }
         ).set_index("filename")
-    except ValueError:
+    except ValueError | KeyError:
         df = pd.DataFrame(
             {
                 "filename": inference_dataset["id"],

From ad549794b7357da8d3d089e9c025b55f0302030a Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 16 Oct 2024 18:38:59 -0400
Subject: [PATCH 122/161] Fully broaden classifier errors

---
 src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
index 367c2e2..5b7ac07 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py
@@ -69,6 +69,9 @@ def classify_chunk_layouts(
 ) -> pd.DataFrame:
     """Extract a set of filings and return results."""
     _, inference_dataset = parsed_chunk
+    if inference_dataset.empty:
+        return Ex21Layout.example(size=0)
+
     try:
         df = pd.DataFrame(
             {
@@ -76,7 +79,8 @@ def classify_chunk_layouts(
                 "paragraph": exhibit21_layout_classifier.predict(inference_dataset),
             }
         ).set_index("filename")
-    except ValueError | KeyError:
+    except Exception:
+        logger.warning(traceback.format_exc())
         df = pd.DataFrame(
             {
                 "filename": inference_dataset["id"],

From 672e123a84070454ef23a307f5b56c6058486be2 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 16 Oct 2024 19:23:32 -0400
Subject: [PATCH 123/161] add more docs on running the notebooks

---
 src/mozilla_sec_eia/models/sec10k/README.rst | 26 +++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/README.rst b/src/mozilla_sec_eia/models/sec10k/README.rst
index c5f94f4..13afc39 100644
--- a/src/mozilla_sec_eia/models/sec10k/README.rst
+++ b/src/mozilla_sec_eia/models/sec10k/README.rst
@@ -56,9 +56,33 @@ formatted as paragraphs rather than tables. For now we will likely just filter o
 results, but we could also develop a separate extraction model which handles these
 documents better.
 
-This model also depends on upstream assets to produce training data, which will need
+This model is located in ``notebooks/exhibit21_layout_classifier.ipynb``, and it also
+depends on upstream assets to produce training data, which will need
 to be produced before running the notebook.
 
+Training the Models
+"""""""""""""""""""
+The models are trained by running the notebooks. This can be done either interactively
+like a normal notebook or through dagster directly.
+
+Whether running interactively or with dagster, you will first need to produce the
+upstream data assets:
+
+1. Launch dagster from the repo root with the ``dagster dev`` command
+2. Locate the training job in question using the dagster web UI
+3. Select the upstream assets by holding down the shift key and clicking on each
+   asset, excluding the notebook asset
+4. Click ``Materialize all`` in the UI
+
+Once this is complete, you can simply launch ``Jupyter`` and run the notebooks
+interactively as you would any other notebook. The first cell loads the upstream
+assets and sets configuration. You can modify the configuration directly in the
+notebook as normal.
+
+To run the notebook in dagster, you simply execute it like any other normal asset.
+You can first set configuration in the dagster launchpad if desired, and when it
+completes executing, you can click on the asset to view the fully rendered notebook.
+
 Usage
 -----
 

From b95b2fb9d6664e929be7d2ee93c2b5722ea46ebe Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Mon, 21 Oct 2024 16:43:31 -0700
Subject: [PATCH 124/161] add splink notebooks and preprocessing functions

---
 environment.yml                               |    9 +-
 notebooks/16-kl-splink-ex21-filer-link.ipynb  | 3507 +++++++++++++++++
 .../17-kl-paragraph-layout-metrics.ipynb      |  687 ++++
 notebooks/18-kl-splink-sec-eia.ipynb          | 3326 ++++++++++++++++
 pyproject.toml                                |   12 +-
 .../models/sec_eia_record_linkage/__init__.py |    1 +
 .../sec_eia_record_linkage/preprocessing.py   |  288 ++
 7 files changed, 7822 insertions(+), 8 deletions(-)
 create mode 100644 notebooks/16-kl-splink-ex21-filer-link.ipynb
 create mode 100644 notebooks/17-kl-paragraph-layout-metrics.ipynb
 create mode 100644 notebooks/18-kl-splink-sec-eia.ipynb
 create mode 100644 src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py
 create mode 100644 src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py

diff --git a/environment.yml b/environment.yml
index 73986fe..3ad1cd4 100644
--- a/environment.yml
+++ b/environment.yml
@@ -5,7 +5,7 @@ channels:
 dependencies:
   # Packages required for setting up the environment
   - pip>=21,<24
-  - python>=3.10,<3.12
+  - python>=3.10,<=3.12
   - setuptools>=66,<69
 
   # Packages specified in setup.py that need or benefit from binary conda packages
@@ -24,8 +24,11 @@ dependencies:
   # These are not normal Python packages available on PyPI
   - nodejs # Useful for Jupyter and prettier pre-commit hook
 
-  - catalystcoop.pudl>=2023.2.5,<=2024.8.0
+  - dask>=2024
+  - gdal
 
   # Use pip to install the package defined by this repo for development:
   - pip:
-      - --editable ./[dev,docs,tests,types]
+    - git+https://github.com/catalyst-cooperative/pudl.git@main
+    # - -e /Users/katielamb/CatalystCoop/pudl[dev,docs,tests,types]
+    - --editable ./[dev,docs,tests,types]
diff --git a/notebooks/16-kl-splink-ex21-filer-link.ipynb b/notebooks/16-kl-splink-ex21-filer-link.ipynb
new file mode 100644
index 0000000..2e656d3
--- /dev/null
+++ b/notebooks/16-kl-splink-ex21-filer-link.ipynb
@@ -0,0 +1,3507 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "c535d97b-5dfa-4298-87f5-55c56c4c82ed",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 3"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "e1222c94-36cd-4bae-95fb-089e5411e490",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from upath import UPath\n",
+    "\n",
+    "# from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive\n",
+    "# from pudl.analysis.record_linkage import name_cleaner\n",
+    "from mozilla_sec_eia.models.sec_eia_record_linkage.preprocessing import prepare_sec10k_basic_info_df, prepare_ex21_df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "16cd6122-4cb9-42aa-8be1-84c997a34e96",
+   "metadata": {},
+   "source": [
+    "# Read in Inputs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "67da3bf4-abbd-40c2-850b-1c73953625c8",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "raw_eia_df = pd.read_parquet(\"s3://pudl.catalyst.coop/stable/out_eia__yearly_utilities.parquet\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "28bdfdfd-beeb-4097-b4d3-b58a7c30f64d",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "eia_df = raw_eia_df.copy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "ee54bb48-cbe4-4261-9545-d4b2bdcb731e",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "mergers_df = pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_mergers.parquet\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "8e69b4ba-8e7b-4d17-bc8c-a06f059f6015",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "raw_eia861_df = pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__assn_utility.parquet\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "ce60f760-5b94-4889-92c5-ac0ed5cd6d82",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "missing_utils = raw_eia861_df[~raw_eia861_df.utility_id_eia.isin(raw_eia_df.utility_id_eia.unique())].utility_id_eia.unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "a3ef2365-e459-44b3-94b0-77020cd606f2",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "harvested_df = pd.concat([\n",
+    "    pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_utility_data_misc.parquet\")[[\"report_date\", \"utility_id_eia\", \"utility_name_eia\"]],\n",
+    "    pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_operational_data_misc.parquet\")[[\"report_date\", \"utility_id_eia\", \"utility_name_eia\"]],\n",
+    "    pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_demand_side_management_misc.parquet\")[[\"report_date\", \"utility_id_eia\", \"utility_name_eia\"]],\n",
+    "    pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_energy_efficiency.parquet\")[[\"report_date\", \"utility_id_eia\", \"utility_name_eia\"]],\n",
+    "])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "59fd9d69-b700-43ec-bb7a-f99eea1e0ec9",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "eia861_df = raw_eia861_df.merge(harvested_df, on=[\"report_date\", \"utility_id_eia\"], how=\"left\").drop_duplicates(subset=[\"report_date\", \"utility_id_eia\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "a47d17c1-0df1-412f-9687-3d540266f005",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "mergers_df = mergers_df[mergers_df[\"new_parent\"].notna()]\n",
+    "eia861_df = eia861_df.merge(mergers_df[[\"report_date\", \"new_parent\", \"merge_address\", \"merge_city\", \"merge_state\"]], \n",
+    "                how=\"left\", \n",
+    "                left_on=[\"report_date\", \"utility_name_eia\"],\n",
+    "                right_on=[\"report_date\", \"new_parent\"]\n",
+    "               )\n",
+    "eia861_df = eia861_df.rename(columns={\"merge_address\": \"street_address\", \"merge_city\": \"city\"})\n",
+    "eia861_df = eia861_df.groupby([\"report_date\", \"utility_id_eia\"]).first().reset_index()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "fa6515b1-5012-4ec0-af96-f9fda11a9c5d",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>report_date</th>\n",
+       "      <th>utility_id_eia</th>\n",
+       "      <th>state</th>\n",
+       "      <th>utility_name_eia</th>\n",
+       "      <th>new_parent</th>\n",
+       "      <th>street_address</th>\n",
+       "      <th>city</th>\n",
+       "      <th>merge_state</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>29933</th>\n",
+       "      <td>2009-01-01</td>\n",
+       "      <td>17698</td>\n",
+       "      <td>LA</td>\n",
+       "      <td>Southwestern Electric Power Co</td>\n",
+       "      <td>Southwestern Electric Power Co</td>\n",
+       "      <td>1 Riverside Plaza</td>\n",
+       "      <td>Columbus</td>\n",
+       "      <td>OH</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>33258</th>\n",
+       "      <td>2010-01-01</td>\n",
+       "      <td>17698</td>\n",
+       "      <td>AR</td>\n",
+       "      <td>Southwestern Electric Power Co</td>\n",
+       "      <td>Southwestern Electric Power Co</td>\n",
+       "      <td>1 Riverside Plaza</td>\n",
+       "      <td>Columbus</td>\n",
+       "      <td>OH</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>49001</th>\n",
+       "      <td>2015-01-01</td>\n",
+       "      <td>11788</td>\n",
+       "      <td>IA</td>\n",
+       "      <td>Consumers Energy</td>\n",
+       "      <td>Consumers Energy</td>\n",
+       "      <td>One Enrgy Plaza</td>\n",
+       "      <td>Jackson</td>\n",
+       "      <td>MI</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>56853</th>\n",
+       "      <td>2017-01-01</td>\n",
+       "      <td>19157</td>\n",
+       "      <td>IA</td>\n",
+       "      <td>MiEnergy Cooperative</td>\n",
+       "      <td>MiEnergy Cooperative</td>\n",
+       "      <td>31110 Cooperative Way</td>\n",
+       "      <td>Rushford</td>\n",
+       "      <td>MN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>70820</th>\n",
+       "      <td>2021-01-01</td>\n",
+       "      <td>40165</td>\n",
+       "      <td>AZ</td>\n",
+       "      <td>Dixie Escalante R E A, Inc</td>\n",
+       "      <td>Dixie Escalante R E A, Inc</td>\n",
+       "      <td>495 N 3200 W</td>\n",
+       "      <td>Flowell</td>\n",
+       "      <td>UT</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      report_date  utility_id_eia state                utility_name_eia                      new_parent         street_address      city merge_state\n",
+       "29933  2009-01-01           17698    LA  Southwestern Electric Power Co  Southwestern Electric Power Co      1 Riverside Plaza  Columbus          OH\n",
+       "33258  2010-01-01           17698    AR  Southwestern Electric Power Co  Southwestern Electric Power Co      1 Riverside Plaza  Columbus          OH\n",
+       "49001  2015-01-01           11788    IA                Consumers Energy                Consumers Energy        One Enrgy Plaza   Jackson          MI\n",
+       "56853  2017-01-01           19157    IA            MiEnergy Cooperative            MiEnergy Cooperative  31110 Cooperative Way  Rushford          MN\n",
+       "70820  2021-01-01           40165    AZ      Dixie Escalante R E A, Inc      Dixie Escalante R E A, Inc           495 N 3200 W   Flowell          UT"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "eia861_df[(eia861_df.state != eia861_df.merge_state) & (eia861_df.merge_state.notna())]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "8ff7b788-5fef-4e88-94ff-89b25619aed8",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "eia861_df[\"state\"] = eia861_df[\"state\"].where(eia861_df[\"merge_state\"].isnull(), eia861_df[\"merge_state\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "17885342-b464-4f4d-ac75-b7be4d4ec7cc",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "eia861_df = eia861_df.drop(columns=[\"new_parent\", \"merge_state\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "fb71f68d-92da-468b-b8a5-02f5ba4b4459",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "eia_df = pd.concat([eia_df, eia861_df])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "85402523-e28a-4410-b933-eb71572b9a00",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "eia_df = eia_df.drop_duplicates(subset=[\"utility_id_eia\", \"report_date\"], keep=\"first\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "94e824d6-dd6a-47db-9447-3363e8d14fe0",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# not sure at what point this stops being a datetime\n",
+    "eia_df[\"report_date\"] = eia_df[\"report_date\"].astype(\"datetime64[ns]\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "56857668-ecd5-4c62-9286-e50c334750c5",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# there are nulls from non harvested 861 utilities\n",
+    "eia_df = eia_df.dropna(subset=\"utility_name_eia\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "c29d0b75-759f-445c-adac-b2a6baf1fd0e",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# for now try just training on 2023\n",
+    "raw_sec_df = pd.concat([pd.read_parquet(\"gs://sec10k-outputs/v2/basic_10k_company_info/2023q1.parquet\"),\n",
+    "                        pd.read_parquet(\"gs://sec10k-outputs/v2/basic_10k_company_info/2023q2.parquet\"),\n",
+    "                        pd.read_parquet(\"gs://sec10k-outputs/v2/basic_10k_company_info/2023q3.parquet\"),\n",
+    "                        pd.read_parquet(\"gs://sec10k-outputs/v2/basic_10k_company_info/2023q4.parquet\"),\n",
+    "                       ]\n",
+    "                      )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "id": "dbf3b15c-3a5a-4b74-a929-71aec18750a1",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "raw_sec_df = raw_sec_df.reset_index().pivot_table(values=\"value\", index=\"filename\", columns=\"key\", aggfunc=\"first\")\n",
+    "raw_sec_df.columns.name = None"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "a8ec4fad-c92f-4cfc-a3d2-409a72a2df1e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ex21_path = UPath(\"gs://sec10k-outputs/v2/ex21_company_ownership_info\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "8e7a642d-7718-4101-b851-f1f4ee07180e",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "raw_ex21_df = pd.DataFrame()\n",
+    "for file in ex21_path.iterdir():\n",
+    "    if file.name.split(\".\")[-1] == \"parquet\":\n",
+    "        report_year = file.name[:4]\n",
+    "        # for now just train with 2023\n",
+    "        if report_year != \"2023\":\n",
+    "            continue\n",
+    "        year_quarter_df = pd.read_parquet(ex21_path / file.name)\n",
+    "        year_quarter_df.loc[:, \"report_year\"] = report_year\n",
+    "        year_quarter_df.loc[:, \"report_year\"] = pd.to_datetime(year_quarter_df[\"report_year\"], format=\"%Y\").dt.year\n",
+    "        raw_ex21_df = pd.concat([raw_ex21_df, year_quarter_df])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "7daad7a6-c590-4324-9e31-2bb5c9fa4d6c",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>utility_id_eia</th>\n",
+       "      <th>utility_id_pudl</th>\n",
+       "      <th>utility_name_eia</th>\n",
+       "      <th>report_date</th>\n",
+       "      <th>street_address</th>\n",
+       "      <th>city</th>\n",
+       "      <th>state</th>\n",
+       "      <th>zip_code</th>\n",
+       "      <th>plants_reported_owner</th>\n",
+       "      <th>plants_reported_operator</th>\n",
+       "      <th>...</th>\n",
+       "      <th>contact_lastname</th>\n",
+       "      <th>contact_title</th>\n",
+       "      <th>phone_number</th>\n",
+       "      <th>phone_extension</th>\n",
+       "      <th>contact_firstname_2</th>\n",
+       "      <th>contact_lastname_2</th>\n",
+       "      <th>contact_title_2</th>\n",
+       "      <th>phone_number_2</th>\n",
+       "      <th>phone_extension_2</th>\n",
+       "      <th>data_maturity</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>33</th>\n",
+       "      <td>66292</td>\n",
+       "      <td>16386.0</td>\n",
+       "      <td>Desert Willow Energy Storage</td>\n",
+       "      <td>2023-01-01</td>\n",
+       "      <td>100 Bayview Circle</td>\n",
+       "      <td>Newport Beach</td>\n",
+       "      <td>CA</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>...</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>provisional</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>35</th>\n",
+       "      <td>66291</td>\n",
+       "      <td>16385.0</td>\n",
+       "      <td>Portage Solar Plant</td>\n",
+       "      <td>2023-01-01</td>\n",
+       "      <td>N8917</td>\n",
+       "      <td>Portage</td>\n",
+       "      <td>WI</td>\n",
+       "      <td>53901</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>...</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>provisional</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>37</th>\n",
+       "      <td>66290</td>\n",
+       "      <td>16384.0</td>\n",
+       "      <td>NSF Energy One LLC</td>\n",
+       "      <td>2023-01-01</td>\n",
+       "      <td>1241 University Ave</td>\n",
+       "      <td>Rochester</td>\n",
+       "      <td>NY</td>\n",
+       "      <td>14607</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>...</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>provisional</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>3 rows × 27 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "    utility_id_eia  utility_id_pudl              utility_name_eia report_date  \\\n",
+       "33           66292          16386.0  Desert Willow Energy Storage  2023-01-01   \n",
+       "35           66291          16385.0           Portage Solar Plant  2023-01-01   \n",
+       "37           66290          16384.0            NSF Energy One LLC  2023-01-01   \n",
+       "\n",
+       "         street_address           city state zip_code plants_reported_owner  \\\n",
+       "33   100 Bayview Circle  Newport Beach    CA     None                  None   \n",
+       "35                N8917        Portage    WI    53901                  None   \n",
+       "37  1241 University Ave      Rochester    NY    14607                  None   \n",
+       "\n",
+       "   plants_reported_operator  ... contact_lastname contact_title phone_number  \\\n",
+       "33                     None  ...             None          None         None   \n",
+       "35                     None  ...             None          None         None   \n",
+       "37                     None  ...             None          None         None   \n",
+       "\n",
+       "   phone_extension contact_firstname_2 contact_lastname_2 contact_title_2  \\\n",
+       "33            None                None               None            None   \n",
+       "35            None                None               None            None   \n",
+       "37            None                None               None            None   \n",
+       "\n",
+       "   phone_number_2 phone_extension_2 data_maturity  \n",
+       "33           None              None   provisional  \n",
+       "35           None              None   provisional  \n",
+       "37           None              None   provisional  \n",
+       "\n",
+       "[3 rows x 27 columns]"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "eia_df[(eia_df.street_address.notnull())].head(3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "30c02757-45c0-403c-aa38-7422d3549a2b",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "eia_subset = eia_df[eia_df.report_date == \"2020-01-01\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "id": "1c0365a3-51d2-455b-8863-bc4dc22572f9",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>utility_id_eia</th>\n",
+       "      <th>utility_id_pudl</th>\n",
+       "      <th>utility_name_eia</th>\n",
+       "      <th>report_date</th>\n",
+       "      <th>street_address</th>\n",
+       "      <th>city</th>\n",
+       "      <th>state</th>\n",
+       "      <th>zip_code</th>\n",
+       "      <th>plants_reported_owner</th>\n",
+       "      <th>plants_reported_operator</th>\n",
+       "      <th>...</th>\n",
+       "      <th>contact_lastname</th>\n",
+       "      <th>contact_title</th>\n",
+       "      <th>phone_number</th>\n",
+       "      <th>phone_extension</th>\n",
+       "      <th>contact_firstname_2</th>\n",
+       "      <th>contact_lastname_2</th>\n",
+       "      <th>contact_title_2</th>\n",
+       "      <th>phone_number_2</th>\n",
+       "      <th>phone_extension_2</th>\n",
+       "      <th>data_maturity</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>71566</th>\n",
+       "      <td>5416</td>\n",
+       "      <td>90.0</td>\n",
+       "      <td>Duke Energy Corp</td>\n",
+       "      <td>2010-01-01</td>\n",
+       "      <td>P O Box 1006</td>\n",
+       "      <td>Charlotte</td>\n",
+       "      <td>NC</td>\n",
+       "      <td>28202</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>...</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>final</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>71568</th>\n",
+       "      <td>5416</td>\n",
+       "      <td>90.0</td>\n",
+       "      <td>Duke Energy Corp</td>\n",
+       "      <td>2008-01-01</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Charlotte</td>\n",
+       "      <td>NC</td>\n",
+       "      <td>28201</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>...</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>final</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>71569</th>\n",
+       "      <td>5416</td>\n",
+       "      <td>90.0</td>\n",
+       "      <td>Duke Energy Corp</td>\n",
+       "      <td>2007-01-01</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Charlotte</td>\n",
+       "      <td>NC</td>\n",
+       "      <td>28201</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Ashcraft</td>\n",
+       "      <td>Sr. Engineering Technologist</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Robert</td>\n",
+       "      <td>Mc Murry</td>\n",
+       "      <td>Dir Carolinas Integrated Resou</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>final</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>71570</th>\n",
+       "      <td>5416</td>\n",
+       "      <td>90.0</td>\n",
+       "      <td>Duke Energy Corp</td>\n",
+       "      <td>2006-01-01</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Charlotte</td>\n",
+       "      <td>NC</td>\n",
+       "      <td>28201</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Duckworth</td>\n",
+       "      <td>Planning Engineer</td>\n",
+       "      <td>704-382-4327</td>\n",
+       "      <td>382</td>\n",
+       "      <td>Steven</td>\n",
+       "      <td>Jester</td>\n",
+       "      <td>Director, Rate Admn &amp; Cust Inq</td>\n",
+       "      <td>704-382-4887</td>\n",
+       "      <td>None</td>\n",
+       "      <td>final</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>71571</th>\n",
+       "      <td>5416</td>\n",
+       "      <td>90.0</td>\n",
+       "      <td>Duke Energy Corp</td>\n",
+       "      <td>2005-01-01</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Charlotte</td>\n",
+       "      <td>NC</td>\n",
+       "      <td>28201</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Duckworth</td>\n",
+       "      <td>Planning Engineer</td>\n",
+       "      <td>704-382-4327</td>\n",
+       "      <td>382</td>\n",
+       "      <td>Steven</td>\n",
+       "      <td>Jester</td>\n",
+       "      <td>Director, Rate Admn &amp; Cust Inq</td>\n",
+       "      <td>704-382-4887</td>\n",
+       "      <td>None</td>\n",
+       "      <td>final</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>71572</th>\n",
+       "      <td>5416</td>\n",
+       "      <td>90.0</td>\n",
+       "      <td>Duke Energy Corp</td>\n",
+       "      <td>2004-01-01</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Charlotte</td>\n",
+       "      <td>NC</td>\n",
+       "      <td>28201</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Duckworth</td>\n",
+       "      <td>Planning Engineer</td>\n",
+       "      <td>704-382-4327</td>\n",
+       "      <td>0</td>\n",
+       "      <td>Steven</td>\n",
+       "      <td>Jester</td>\n",
+       "      <td>Director, Rate Admn &amp; Cust Inq</td>\n",
+       "      <td>704-382-4887</td>\n",
+       "      <td>None</td>\n",
+       "      <td>final</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>71573</th>\n",
+       "      <td>5416</td>\n",
+       "      <td>90.0</td>\n",
+       "      <td>Duke Energy Corp</td>\n",
+       "      <td>2003-01-01</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Charlotte</td>\n",
+       "      <td>NC</td>\n",
+       "      <td>28201</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Duckworth</td>\n",
+       "      <td>Process Leader</td>\n",
+       "      <td>None</td>\n",
+       "      <td>0</td>\n",
+       "      <td>Steven</td>\n",
+       "      <td>Jester</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>final</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>71574</th>\n",
+       "      <td>5416</td>\n",
+       "      <td>90.0</td>\n",
+       "      <td>Duke Energy Corp</td>\n",
+       "      <td>2002-01-01</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Charlotte</td>\n",
+       "      <td>NC</td>\n",
+       "      <td>28201</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Scott Henry</td>\n",
+       "      <td>Process Leader</td>\n",
+       "      <td>None</td>\n",
+       "      <td>0</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Mgr Reg Policy $ Res</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>final</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>71575</th>\n",
+       "      <td>5416</td>\n",
+       "      <td>90.0</td>\n",
+       "      <td>Duke Energy Corp</td>\n",
+       "      <td>2001-01-01</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Charlotte</td>\n",
+       "      <td>NC</td>\n",
+       "      <td>28201</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>...</td>\n",
+       "      <td>R S Henry</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>0</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Mgr Operating Plann &amp; Analysis</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>final</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>9 rows × 27 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       utility_id_eia  utility_id_pudl  utility_name_eia report_date  \\\n",
+       "71566            5416             90.0  Duke Energy Corp  2010-01-01   \n",
+       "71568            5416             90.0  Duke Energy Corp  2008-01-01   \n",
+       "71569            5416             90.0  Duke Energy Corp  2007-01-01   \n",
+       "71570            5416             90.0  Duke Energy Corp  2006-01-01   \n",
+       "71571            5416             90.0  Duke Energy Corp  2005-01-01   \n",
+       "71572            5416             90.0  Duke Energy Corp  2004-01-01   \n",
+       "71573            5416             90.0  Duke Energy Corp  2003-01-01   \n",
+       "71574            5416             90.0  Duke Energy Corp  2002-01-01   \n",
+       "71575            5416             90.0  Duke Energy Corp  2001-01-01   \n",
+       "\n",
+       "      street_address       city state zip_code plants_reported_owner  \\\n",
+       "71566   P O Box 1006  Charlotte    NC    28202                  None   \n",
+       "71568           None  Charlotte    NC    28201                  None   \n",
+       "71569           None  Charlotte    NC    28201                  None   \n",
+       "71570           None  Charlotte    NC    28201                  None   \n",
+       "71571           None  Charlotte    NC    28201                  None   \n",
+       "71572           None  Charlotte    NC    28201                  None   \n",
+       "71573           None  Charlotte    NC    28201                  None   \n",
+       "71574           None  Charlotte    NC    28201                  None   \n",
+       "71575           None  Charlotte    NC    28201                  None   \n",
+       "\n",
+       "      plants_reported_operator  ... contact_lastname  \\\n",
+       "71566                     None  ...             None   \n",
+       "71568                     None  ...             None   \n",
+       "71569                     None  ...         Ashcraft   \n",
+       "71570                     None  ...        Duckworth   \n",
+       "71571                     None  ...        Duckworth   \n",
+       "71572                     None  ...        Duckworth   \n",
+       "71573                     None  ...        Duckworth   \n",
+       "71574                     None  ...      Scott Henry   \n",
+       "71575                     None  ...        R S Henry   \n",
+       "\n",
+       "                      contact_title  phone_number phone_extension  \\\n",
+       "71566                          None          None            None   \n",
+       "71568                          None          None            None   \n",
+       "71569  Sr. Engineering Technologist          None            None   \n",
+       "71570             Planning Engineer  704-382-4327             382   \n",
+       "71571             Planning Engineer  704-382-4327             382   \n",
+       "71572             Planning Engineer  704-382-4327               0   \n",
+       "71573                Process Leader          None               0   \n",
+       "71574                Process Leader          None               0   \n",
+       "71575                          None          None               0   \n",
+       "\n",
+       "      contact_firstname_2 contact_lastname_2                 contact_title_2  \\\n",
+       "71566                None               None                            None   \n",
+       "71568                None               None                            None   \n",
+       "71569              Robert           Mc Murry  Dir Carolinas Integrated Resou   \n",
+       "71570              Steven             Jester  Director, Rate Admn & Cust Inq   \n",
+       "71571              Steven             Jester  Director, Rate Admn & Cust Inq   \n",
+       "71572              Steven             Jester  Director, Rate Admn & Cust Inq   \n",
+       "71573              Steven             Jester                            None   \n",
+       "71574                None               None            Mgr Reg Policy $ Res   \n",
+       "71575                None               None  Mgr Operating Plann & Analysis   \n",
+       "\n",
+       "      phone_number_2 phone_extension_2 data_maturity  \n",
+       "71566           None              None         final  \n",
+       "71568           None              None         final  \n",
+       "71569           None              None         final  \n",
+       "71570   704-382-4887              None         final  \n",
+       "71571   704-382-4887              None         final  \n",
+       "71572   704-382-4887              None         final  \n",
+       "71573           None              None         final  \n",
+       "71574           None              None         final  \n",
+       "71575           None              None         final  \n",
+       "\n",
+       "[9 rows x 27 columns]"
+      ]
+     },
+     "execution_count": 58,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "eia_df[(eia_df.utility_name_eia.str.contains(\"Duke Energy Corp\")) & (eia_df.state == \"NC\")].drop_duplicates()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f3d5db08-3c42-4715-9f0d-4d02674b828a",
+   "metadata": {},
+   "source": [
+    "# Preprocessing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 82,
+   "id": "39706c77-90db-4f49-8011-47a9777a88b6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sec_df = prepare_sec10k_basic_info_df(raw_sec_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 83,
+   "id": "98d4f59e-d61f-4a24-84bc-6caa0d761e07",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ex21_df = prepare_ex21_df(raw_ex21_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 84,
+   "id": "11caf325-8530-430d-a3d2-a54043447021",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 84,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# sec_df has filename as unique ID\n",
+    "sec_df.filename.is_unique"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ceed053b-f6ae-4aad-8b12-b2083ba8e236",
+   "metadata": {},
+   "source": [
+    "Note: documents with a paragraph layout are not removed here, but perhaps they should be."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1bb694c9-cfbd-4e2f-b69c-9996a588d2d2",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "# Match Ex. 21 Subsidiaries to an SEC filer"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "01d3a5e1-ad17-4266-b2ef-358f246749db",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "## Preprocessing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 85,
+   "id": "4df63893-8a18-4b00-9b16-d036108bd567",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>state</th>\n",
+       "      <th>state_of_incorporation</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>ny</td>\n",
+       "      <td>de</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>ny</td>\n",
+       "      <td>de</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>ca</td>\n",
+       "      <td>md</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>ga</td>\n",
+       "      <td>de</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>nj</td>\n",
+       "      <td>de</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8265</th>\n",
+       "      <td>ny</td>\n",
+       "      <td>de</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8266</th>\n",
+       "      <td>tx</td>\n",
+       "      <td>de</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8267</th>\n",
+       "      <td>ny</td>\n",
+       "      <td>oh</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8268</th>\n",
+       "      <td>tx</td>\n",
+       "      <td>de</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8269</th>\n",
+       "      <td>ct</td>\n",
+       "      <td>de</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5051 rows × 2 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     state state_of_incorporation\n",
+       "1       ny                     de\n",
+       "2       ny                     de\n",
+       "5       ca                     md\n",
+       "6       ga                     de\n",
+       "7       nj                     de\n",
+       "...    ...                    ...\n",
+       "8265    ny                     de\n",
+       "8266    tx                     de\n",
+       "8267    ny                     oh\n",
+       "8268    tx                     de\n",
+       "8269    ct                     de\n",
+       "\n",
+       "[5051 rows x 2 columns]"
+      ]
+     },
+     "execution_count": 85,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sec_df[(sec_df[\"state\"] != sec_df[\"state_of_incorporation\"]) & (~sec_df[\"state_of_incorporation\"].isnull())][[\"state\", \"state_of_incorporation\"]]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 86,
+   "id": "24890018-8efb-445f-ad91-ca316edccbe8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sec_match_df = sec_df.copy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 87,
+   "id": "83f859df-1764-4e97-addc-0064bdcb31b7",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "loc_of_incorporation\n",
+       "False    6359\n",
+       "True      748\n",
+       "Name: count, dtype: int64"
+      ]
+     },
+     "execution_count": 87,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sec_match_df[\"loc_of_incorporation\"].isnull().value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 88,
+   "id": "e9d0828f-0ad8-41ea-a449-ddd274a888d0",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "ex21_match_df = ex21_df.copy()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ef3f01c7-c21e-4755-ac99-4ea01f359c43",
+   "metadata": {},
+   "source": [
+    "Remove clearly \"invalid\" strings and fill nulls"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 89,
+   "id": "4ca07927-185d-4bc6-978a-e8788a8f77b3",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "company_name\n",
+       "rush truck center                                           120\n",
+       "encompass health rehabilitation hospital                     79\n",
+       "rush peterbilt truck center                                  57\n",
+       "branch                                                       52\n",
+       "sci funeral services, llc iowa limited liability company     33\n",
+       "partnership limited partnership                              32\n",
+       "alderwoods group, llc de limited liability company           27\n",
+       "encompass health rehabilitation hospital of                  26\n",
+       "u haul co. of                                                26\n",
+       "at&t                                                         25\n",
+       "corporation                                                  21\n",
+       "amh portfolio management                                     20\n",
+       "rush bus center                                              20\n",
+       "limited partnership limited partnership                      18\n",
+       "rapy limited partnership                                     15\n",
+       "rush isuzu trucks                                            15\n",
+       "colgate palmolive limited                                    14\n",
+       "ecolab limited                                               11\n",
+       "rush truck centres                                           11\n",
+       "johnson and johnson limited                                  11\n",
+       "Name: count, dtype: int64"
+      ]
+     },
+     "execution_count": 89,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "ex21_match_df.company_name.value_counts().head(20)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 90,
+   "id": "8a4839e5-a2e5-4098-826a-4d340cdde638",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "ex21_match_df = ex21_match_df[[\"record_id\", \"report_year\", \"company_name\", \"loc_of_incorporation\", \"company_name_mphone\"]]\n",
+    "sec_match_df = sec_match_df[[\"record_id\", \"report_year\", \"company_name\", \"loc_of_incorporation\", \"company_name_mphone\"]]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c294372b-159c-4c90-a031-61c34532b965",
+   "metadata": {},
+   "source": [
+    "## Exploratory Analysis"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 91,
+   "id": "c9dbc620-ed49-4a8e-9d02-6b6f2e0a14cf",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from splink.exploratory import completeness_chart, profile_columns\n",
+    "from splink import DuckDBAPI\n",
+    "\n",
+    "db_api = DuckDBAPI()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 102,
+   "id": "422ca098-e4e7-4284-8b04-74e976e36023",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "match_cols = [\"report_year\", \"company_name\", \"loc_of_incorporation\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 93,
+   "id": "232b5718-c1ed-4e63-8384-b4acf33210d3",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "<style>\n",
+       "  #altair-viz-9fc6a32a878140b19ee19cd6c6006b48.vega-embed {\n",
+       "    width: 100%;\n",
+       "    display: flex;\n",
+       "  }\n",
+       "\n",
+       "  #altair-viz-9fc6a32a878140b19ee19cd6c6006b48.vega-embed details,\n",
+       "  #altair-viz-9fc6a32a878140b19ee19cd6c6006b48.vega-embed details summary {\n",
+       "    position: relative;\n",
+       "  }\n",
+       "</style>\n",
+       "<div id=\"altair-viz-9fc6a32a878140b19ee19cd6c6006b48\"></div>\n",
+       "<script type=\"text/javascript\">\n",
+       "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
+       "  (function(spec, embedOpt){\n",
+       "    let outputDiv = document.currentScript.previousElementSibling;\n",
+       "    if (outputDiv.id !== \"altair-viz-9fc6a32a878140b19ee19cd6c6006b48\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-9fc6a32a878140b19ee19cd6c6006b48\");\n",
+       "    }\n",
+       "    const paths = {\n",
+       "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
+       "      \"vega-lib\": \"https://cdn.jsdelivr.net/npm/vega-lib?noext\",\n",
+       "      \"vega-lite\": \"https://cdn.jsdelivr.net/npm/vega-lite@5.20.1?noext\",\n",
+       "      \"vega-embed\": \"https://cdn.jsdelivr.net/npm/vega-embed@6?noext\",\n",
+       "    };\n",
+       "\n",
+       "    function maybeLoadScript(lib, version) {\n",
+       "      var key = `${lib.replace(\"-\", \"\")}_version`;\n",
+       "      return (VEGA_DEBUG[key] == version) ?\n",
+       "        Promise.resolve(paths[lib]) :\n",
+       "        new Promise(function(resolve, reject) {\n",
+       "          var s = document.createElement('script');\n",
+       "          document.getElementsByTagName(\"head\")[0].appendChild(s);\n",
+       "          s.async = true;\n",
+       "          s.onload = () => {\n",
+       "            VEGA_DEBUG[key] = version;\n",
+       "            return resolve(paths[lib]);\n",
+       "          };\n",
+       "          s.onerror = () => reject(`Error loading script: ${paths[lib]}`);\n",
+       "          s.src = paths[lib];\n",
+       "        });\n",
+       "    }\n",
+       "\n",
+       "    function showError(err) {\n",
+       "      outputDiv.innerHTML = `<div class=\"error\" style=\"color:red;\">${err}</div>`;\n",
+       "      throw err;\n",
+       "    }\n",
+       "\n",
+       "    function displayChart(vegaEmbed) {\n",
+       "      vegaEmbed(outputDiv, spec, embedOpt)\n",
+       "        .catch(err => showError(`Javascript Error: ${err.message}<br>This usually means there's a typo in your chart specification. See the javascript console for the full traceback.`));\n",
+       "    }\n",
+       "\n",
+       "    if(typeof define === \"function\" && define.amd) {\n",
+       "      requirejs.config({paths});\n",
+       "      require([\"vega-embed\"], displayChart, err => showError(`Error loading script: ${err.message}`));\n",
+       "    } else {\n",
+       "      maybeLoadScript(\"vega\", \"5\")\n",
+       "        .then(() => maybeLoadScript(\"vega-lite\", \"5.20.1\"))\n",
+       "        .then(() => maybeLoadScript(\"vega-embed\", \"6\"))\n",
+       "        .catch(showError)\n",
+       "        .then(() => displayChart(vegaEmbed));\n",
+       "    }\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"layer\": [{\"mark\": \"rect\", \"encoding\": {\"color\": {\"field\": \"completeness\", \"legend\": null, \"scale\": {\"scheme\": \"darkred\", \"zero\": true}, \"type\": \"quantitative\"}, \"tooltip\": [{\"field\": \"source_dataset\", \"title\": \"Source dataset\", \"type\": \"nominal\"}, {\"field\": \"total_rows_inc_nulls\", \"format\": \",\", \"title\": \"# of records\", \"type\": \"quantitative\"}, {\"field\": \"column_name\", \"title\": \"Column name\", \"type\": \"nominal\"}, {\"field\": \"total_null_rows\", \"format\": \",\", \"title\": \"# of nulls\", \"type\": \"quantitative\"}, {\"field\": \"completeness\", \"format\": \".1%\", \"type\": \"quantitative\"}], \"x\": {\"axis\": {\"labelAngle\": 20}, \"field\": \"column_name\", \"sort\": {\"field\": \"mean_comp\", \"order\": \"descending\"}, \"title\": \"Column name\", \"type\": \"nominal\"}, \"y\": {\"field\": \"source_dataset\", \"title\": \"Source dataset\", \"type\": \"nominal\"}}, \"title\": \"Column completeness by source dataset\", \"transform\": [{\"joinaggregate\": [{\"op\": \"mean\", \"field\": \"completeness\", \"as\": \"mean_comp\"}], \"groupby\": [\"column_name\"]}]}, {\"mark\": {\"type\": \"text\"}, \"encoding\": {\"color\": {\"condition\": {\"test\": \"datum['completeness'] < 0.5\", \"value\": \"white\"}, \"value\": \"black\"}, \"text\": {\"field\": \"completeness\", \"format\": \".0%\", \"type\": \"quantitative\"}, \"x\": {\"axis\": {\"labelAngle\": 0}, \"field\": \"column_name\", \"sort\": {\"field\": \"mean_comp\", \"order\": \"descending\"}, \"type\": \"nominal\"}, \"y\": {\"field\": \"source_dataset\", \"type\": \"nominal\"}}, \"transform\": [{\"joinaggregate\": [{\"op\": \"mean\", \"field\": \"completeness\", \"as\": \"mean_comp\"}], \"groupby\": [\"column_name\"]}]}], \"data\": {\"name\": \"data-08289b57cb7a9ca1ff1da3e2ddccde42\"}, \"height\": {\"step\": 40}, \"width\": {\"step\": 40}, 
\"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-08289b57cb7a9ca1ff1da3e2ddccde42\": [{\"source_dataset\": \"input_data_1\", \"column_name\": \"report_year\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 192164, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name\", \"total_null_rows\": 1, \"total_rows_inc_nulls\": 192164, \"completeness\": 0.9999948143959045}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"loc_of_incorporation\", \"total_null_rows\": 18779, \"total_rows_inc_nulls\": 192164, \"completeness\": 0.9022761583328247}]}}, {\"mode\": \"vega-lite\"});\n",
+       "</script>"
+      ],
+      "text/plain": [
+       "alt.LayerChart(...)"
+      ]
+     },
+     "execution_count": 93,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# NOTE: this chart sometimes reports loc_of_incorporation as 100% non-null; the cause is not yet understood\n",
+    "completeness_chart(ex21_match_df[match_cols], db_api=db_api)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 94,
+   "id": "520a9b86",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "<style>\n",
+       "  #altair-viz-0fe59c00c8af4561818cd26f7b170021.vega-embed {\n",
+       "    width: 100%;\n",
+       "    display: flex;\n",
+       "  }\n",
+       "\n",
+       "  #altair-viz-0fe59c00c8af4561818cd26f7b170021.vega-embed details,\n",
+       "  #altair-viz-0fe59c00c8af4561818cd26f7b170021.vega-embed details summary {\n",
+       "    position: relative;\n",
+       "  }\n",
+       "</style>\n",
+       "<div id=\"altair-viz-0fe59c00c8af4561818cd26f7b170021\"></div>\n",
+       "<script type=\"text/javascript\">\n",
+       "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
+       "  (function(spec, embedOpt){\n",
+       "    let outputDiv = document.currentScript.previousElementSibling;\n",
+       "    if (outputDiv.id !== \"altair-viz-0fe59c00c8af4561818cd26f7b170021\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-0fe59c00c8af4561818cd26f7b170021\");\n",
+       "    }\n",
+       "    const paths = {\n",
+       "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
+       "      \"vega-lib\": \"https://cdn.jsdelivr.net/npm/vega-lib?noext\",\n",
+       "      \"vega-lite\": \"https://cdn.jsdelivr.net/npm/vega-lite@5.20.1?noext\",\n",
+       "      \"vega-embed\": \"https://cdn.jsdelivr.net/npm/vega-embed@6?noext\",\n",
+       "    };\n",
+       "\n",
+       "    function maybeLoadScript(lib, version) {\n",
+       "      var key = `${lib.replace(\"-\", \"\")}_version`;\n",
+       "      return (VEGA_DEBUG[key] == version) ?\n",
+       "        Promise.resolve(paths[lib]) :\n",
+       "        new Promise(function(resolve, reject) {\n",
+       "          var s = document.createElement('script');\n",
+       "          document.getElementsByTagName(\"head\")[0].appendChild(s);\n",
+       "          s.async = true;\n",
+       "          s.onload = () => {\n",
+       "            VEGA_DEBUG[key] = version;\n",
+       "            return resolve(paths[lib]);\n",
+       "          };\n",
+       "          s.onerror = () => reject(`Error loading script: ${paths[lib]}`);\n",
+       "          s.src = paths[lib];\n",
+       "        });\n",
+       "    }\n",
+       "\n",
+       "    function showError(err) {\n",
+       "      outputDiv.innerHTML = `<div class=\"error\" style=\"color:red;\">${err}</div>`;\n",
+       "      throw err;\n",
+       "    }\n",
+       "\n",
+       "    function displayChart(vegaEmbed) {\n",
+       "      vegaEmbed(outputDiv, spec, embedOpt)\n",
+       "        .catch(err => showError(`Javascript Error: ${err.message}<br>This usually means there's a typo in your chart specification. See the javascript console for the full traceback.`));\n",
+       "    }\n",
+       "\n",
+       "    if(typeof define === \"function\" && define.amd) {\n",
+       "      requirejs.config({paths});\n",
+       "      require([\"vega-embed\"], displayChart, err => showError(`Error loading script: ${err.message}`));\n",
+       "    } else {\n",
+       "      maybeLoadScript(\"vega\", \"5\")\n",
+       "        .then(() => maybeLoadScript(\"vega-lite\", \"5.20.1\"))\n",
+       "        .then(() => maybeLoadScript(\"vega-embed\", \"6\"))\n",
+       "        .catch(showError)\n",
+       "        .then(() => displayChart(vegaEmbed));\n",
+       "    }\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"layer\": [{\"mark\": \"rect\", \"encoding\": {\"color\": {\"field\": \"completeness\", \"legend\": null, \"scale\": {\"scheme\": \"darkred\", \"zero\": true}, \"type\": \"quantitative\"}, \"tooltip\": [{\"field\": \"source_dataset\", \"title\": \"Source dataset\", \"type\": \"nominal\"}, {\"field\": \"total_rows_inc_nulls\", \"format\": \",\", \"title\": \"# of records\", \"type\": \"quantitative\"}, {\"field\": \"column_name\", \"title\": \"Column name\", \"type\": \"nominal\"}, {\"field\": \"total_null_rows\", \"format\": \",\", \"title\": \"# of nulls\", \"type\": \"quantitative\"}, {\"field\": \"completeness\", \"format\": \".1%\", \"type\": \"quantitative\"}], \"x\": {\"axis\": {\"labelAngle\": 20}, \"field\": \"column_name\", \"sort\": {\"field\": \"mean_comp\", \"order\": \"descending\"}, \"title\": \"Column name\", \"type\": \"nominal\"}, \"y\": {\"field\": \"source_dataset\", \"title\": \"Source dataset\", \"type\": \"nominal\"}}, \"title\": \"Column completeness by source dataset\", \"transform\": [{\"joinaggregate\": [{\"op\": \"mean\", \"field\": \"completeness\", \"as\": \"mean_comp\"}], \"groupby\": [\"column_name\"]}]}, {\"mark\": {\"type\": \"text\"}, \"encoding\": {\"color\": {\"condition\": {\"test\": \"datum['completeness'] < 0.5\", \"value\": \"white\"}, \"value\": \"black\"}, \"text\": {\"field\": \"completeness\", \"format\": \".0%\", \"type\": \"quantitative\"}, \"x\": {\"axis\": {\"labelAngle\": 0}, \"field\": \"column_name\", \"sort\": {\"field\": \"mean_comp\", \"order\": \"descending\"}, \"type\": \"nominal\"}, \"y\": {\"field\": \"source_dataset\", \"type\": \"nominal\"}}, \"transform\": [{\"joinaggregate\": [{\"op\": \"mean\", \"field\": \"completeness\", \"as\": \"mean_comp\"}], \"groupby\": [\"column_name\"]}]}], \"data\": {\"name\": \"data-0ad83db79741c92ff59a5e8e4b65695b\"}, \"height\": {\"step\": 40}, \"width\": {\"step\": 40}, 
\"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-0ad83db79741c92ff59a5e8e4b65695b\": [{\"source_dataset\": \"input_data_1\", \"column_name\": \"report_year\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 7107, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 7107, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"loc_of_incorporation\", \"total_null_rows\": 748, \"total_rows_inc_nulls\": 7107, \"completeness\": 0.8947516679763794}]}}, {\"mode\": \"vega-lite\"});\n",
+       "</script>"
+      ],
+      "text/plain": [
+       "alt.LayerChart(...)"
+      ]
+     },
+     "execution_count": 94,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "completeness_chart(sec_match_df[match_cols], db_api=db_api)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6b6b20bc-cd22-42cc-b24d-8d581a311ca8",
+   "metadata": {},
+   "source": [
+    "There is strong skew in the location of incorporation field with around 40-50% of the values being Delaware in both datasets. We therefore want to use `term_frequency_adjustments` in our linkage model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 95,
+   "id": "a5c26016-2c59-4335-bd39-8b2e7ea91840",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "<style>\n",
+       "  #altair-viz-fe7058e71c9d4ecda1e3ac05015e93b6.vega-embed {\n",
+       "    width: 100%;\n",
+       "    display: flex;\n",
+       "  }\n",
+       "\n",
+       "  #altair-viz-fe7058e71c9d4ecda1e3ac05015e93b6.vega-embed details,\n",
+       "  #altair-viz-fe7058e71c9d4ecda1e3ac05015e93b6.vega-embed details summary {\n",
+       "    position: relative;\n",
+       "  }\n",
+       "</style>\n",
+       "<div id=\"altair-viz-fe7058e71c9d4ecda1e3ac05015e93b6\"></div>\n",
+       "<script type=\"text/javascript\">\n",
+       "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
+       "  (function(spec, embedOpt){\n",
+       "    let outputDiv = document.currentScript.previousElementSibling;\n",
+       "    if (outputDiv.id !== \"altair-viz-fe7058e71c9d4ecda1e3ac05015e93b6\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-fe7058e71c9d4ecda1e3ac05015e93b6\");\n",
+       "    }\n",
+       "    const paths = {\n",
+       "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
+       "      \"vega-lib\": \"https://cdn.jsdelivr.net/npm/vega-lib?noext\",\n",
+       "      \"vega-lite\": \"https://cdn.jsdelivr.net/npm/vega-lite@5.20.1?noext\",\n",
+       "      \"vega-embed\": \"https://cdn.jsdelivr.net/npm/vega-embed@6?noext\",\n",
+       "    };\n",
+       "\n",
+       "    function maybeLoadScript(lib, version) {\n",
+       "      var key = `${lib.replace(\"-\", \"\")}_version`;\n",
+       "      return (VEGA_DEBUG[key] == version) ?\n",
+       "        Promise.resolve(paths[lib]) :\n",
+       "        new Promise(function(resolve, reject) {\n",
+       "          var s = document.createElement('script');\n",
+       "          document.getElementsByTagName(\"head\")[0].appendChild(s);\n",
+       "          s.async = true;\n",
+       "          s.onload = () => {\n",
+       "            VEGA_DEBUG[key] = version;\n",
+       "            return resolve(paths[lib]);\n",
+       "          };\n",
+       "          s.onerror = () => reject(`Error loading script: ${paths[lib]}`);\n",
+       "          s.src = paths[lib];\n",
+       "        });\n",
+       "    }\n",
+       "\n",
+       "    function showError(err) {\n",
+       "      outputDiv.innerHTML = `<div class=\"error\" style=\"color:red;\">${err}</div>`;\n",
+       "      throw err;\n",
+       "    }\n",
+       "\n",
+       "    function displayChart(vegaEmbed) {\n",
+       "      vegaEmbed(outputDiv, spec, embedOpt)\n",
+       "        .catch(err => showError(`Javascript Error: ${err.message}<br>This usually means there's a typo in your chart specification. See the javascript console for the full traceback.`));\n",
+       "    }\n",
+       "\n",
+       "    if(typeof define === \"function\" && define.amd) {\n",
+       "      requirejs.config({paths});\n",
+       "      require([\"vega-embed\"], displayChart, err => showError(`Error loading script: ${err.message}`));\n",
+       "    } else {\n",
+       "      maybeLoadScript(\"vega\", \"5\")\n",
+       "        .then(() => maybeLoadScript(\"vega-lite\", \"5.20.1\"))\n",
+       "        .then(() => maybeLoadScript(\"vega-embed\", \"6\"))\n",
+       "        .catch(showError)\n",
+       "        .then(() => displayChart(vegaEmbed));\n",
+       "    }\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"vconcat\": [{\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 192164, \"group_name\": \"_report_year_\", \"total_non_null_rows\": 192164, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 192164.0, \"distinct_value_count\": 1}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 192164, \"group_name\": \"_report_year_\", \"total_non_null_rows\": 192164, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 192164.0, \"distinct_value_count\": 1}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"report_year\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 1 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 192164, \"group_name\": \"_report_year_\", \"value\": \"2023\", \"total_non_null_rows\": 192164, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 1}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], 
\"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 1 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 192164, \"group_name\": \"_report_year_\", \"value\": \"2023\", \"total_non_null_rows\": 192164, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 1}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 192164]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 1 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9993755221366882, \"percentile_inc_nulls\": 0.9993755221366882, \"value_count\": 120, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9989644289016724, \"percentile_inc_nulls\": 0.9989644289016724, \"value_count\": 79, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 79.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9986677765846252, \"percentile_inc_nulls\": 0.9986677765846252, \"value_count\": 57, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 57.0, \"distinct_value_count\": 188768}, 
{\"percentile_ex_nulls\": 0.998397171497345, \"percentile_inc_nulls\": 0.9983972311019897, \"value_count\": 52, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 52.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9982254505157471, \"percentile_inc_nulls\": 0.9982254505157471, \"value_count\": 33, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9980589151382446, \"percentile_inc_nulls\": 0.9980589747428894, \"value_count\": 32, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 32.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.997918426990509, \"percentile_inc_nulls\": 0.997918426990509, \"value_count\": 27, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 27.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9976478219032288, \"percentile_inc_nulls\": 0.9976478219032288, \"value_count\": 26, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 52.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9975177049636841, \"percentile_inc_nulls\": 0.9975177645683289, \"value_count\": 25, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 25.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9974084496498108, \"percentile_inc_nulls\": 0.9974084496498108, \"value_count\": 21, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, 
\"sum_tokens_in_value_count_group\": 21.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9972003102302551, \"percentile_inc_nulls\": 0.9972003102302551, \"value_count\": 20, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 40.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9971066117286682, \"percentile_inc_nulls\": 0.9971066117286682, \"value_count\": 18, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 18.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9969505071640015, \"percentile_inc_nulls\": 0.9969505071640015, \"value_count\": 15, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 30.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9968776702880859, \"percentile_inc_nulls\": 0.9968776702880859, \"value_count\": 14, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 14.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.996705949306488, \"percentile_inc_nulls\": 0.996705949306488, \"value_count\": 11, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9963936805725098, \"percentile_inc_nulls\": 0.9963936805725098, \"value_count\": 10, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 60.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9961127042770386, \"percentile_inc_nulls\": 0.9961127042770386, \"value_count\": 9, \"group_name\": \"_company_name_\", 
\"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 54.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9957379698753357, \"percentile_inc_nulls\": 0.9957380294799805, \"value_count\": 8, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 72.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9952279925346375, \"percentile_inc_nulls\": 0.9952280521392822, \"value_count\": 7, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9941352009773254, \"percentile_inc_nulls\": 0.9941352009773254, \"value_count\": 6, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 210.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9931464195251465, \"percentile_inc_nulls\": 0.9931464791297913, \"value_count\": 5, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 190.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9913563132286072, \"percentile_inc_nulls\": 0.9913563132286072, \"value_count\": 4, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 344.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9881871342658997, \"percentile_inc_nulls\": 0.9881871938705444, \"value_count\": 3, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 609.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9721382260322571, \"percentile_inc_nulls\": 0.9721384048461914, 
\"value_count\": 2, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 3084.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 5.185604095458984e-06, \"value_count\": 1, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 186809.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 120, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 188768}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"company_name\\\"\", \"subtitle\": \"In this col, 1 values (0.0%) are null and there are 188768 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 120, \"group_name\": \"_company_name_\", \"value\": \"rush truck center\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 188768}, {\"value_count\": 79, \"group_name\": \"_company_name_\", \"value\": \"encompass health rehabilitation hospital\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 188768}, {\"value_count\": 57, 
\"group_name\": \"_company_name_\", \"value\": \"rush peterbilt truck center\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 188768}, {\"value_count\": 52, \"group_name\": \"_company_name_\", \"value\": \"branch\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 188768}, {\"value_count\": 33, \"group_name\": \"_company_name_\", \"value\": \"sci funeral services, llc iowa limited liability company\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 188768}, {\"value_count\": 32, \"group_name\": \"_company_name_\", \"value\": \"partnership limited partnership\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 188768}, {\"value_count\": 27, \"group_name\": \"_company_name_\", \"value\": \"alderwoods group, llc de limited liability company\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 188768}, {\"value_count\": 26, \"group_name\": \"_company_name_\", \"value\": \"u haul co. 
of\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 188768}, {\"value_count\": 26, \"group_name\": \"_company_name_\", \"value\": \"encompass health rehabilitation hospital of\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 188768}, {\"value_count\": 25, \"group_name\": \"_company_name_\", \"value\": \"at&t\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 188768}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"pepsico global mobility, limited liability company\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 188768}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"pepsico global real estate, incorporated\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 188768}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"pepsico group finance international b.v\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 188768}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"pepsico logistyka sp. 
z o.o\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 188768}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"pepsico y limited liability company\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 188768}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 120]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.032447993755340576, \"percentile_inc_nulls\": 0.1270008683204651, \"value_count\": 10, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 220.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.03099459409713745, \"percentile_inc_nulls\": 0.12568950653076172, \"value_count\": 9, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 252.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.02970266342163086, \"percentile_inc_nulls\": 0.1245238184928894, \"value_count\": 8, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 224.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.02812814712524414, \"percentile_inc_nulls\": 0.12310320138931274, \"value_count\": 7, 
\"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 273.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.026121079921722412, \"percentile_inc_nulls\": 0.12129223346710205, \"value_count\": 6, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 348.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.02349686622619629, \"percentile_inc_nulls\": 0.11892443895339966, \"value_count\": 5, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 455.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.020636141300201416, \"percentile_inc_nulls\": 0.11634331941604614, \"value_count\": 4, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 496.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.01705455780029297, \"percentile_inc_nulls\": 0.11311173439025879, \"value_count\": 3, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 621.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.01166766881942749, \"percentile_inc_nulls\": 0.10825127363204956, \"value_count\": 2, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 934.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.09772384166717529, \"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 2023.0, \"distinct_value_count\": 3458}, 
{\"percentile_ex_nulls\": 0.6351702809333801, \"percentile_inc_nulls\": 0.6708228588104248, \"value_count\": 63256, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 63256.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.6025030612945557, \"percentile_inc_nulls\": 0.6413480043411255, \"value_count\": 5664, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 5664.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.5844392776489258, \"percentile_inc_nulls\": 0.6250494718551636, \"value_count\": 3132, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 3132.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.5671367049217224, \"percentile_inc_nulls\": 0.6094377636909485, \"value_count\": 3000, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 3000.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.5508146286010742, \"percentile_inc_nulls\": 0.5947107672691345, \"value_count\": 2830, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 2830.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.5346598625183105, \"percentile_inc_nulls\": 0.5801346898078918, \"value_count\": 2801, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 2801.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.5185166001319885, \"percentile_inc_nulls\": 0.5655689835548401, \"value_count\": 2799, \"group_name\": \"_loc_of_incorporation_\", 
\"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 2799.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.5036594867706299, \"percentile_inc_nulls\": 0.5521637797355652, \"value_count\": 2576, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 2576.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.489102303981781, \"percentile_inc_nulls\": 0.5390291213989258, \"value_count\": 2524, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 2524.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.4755774736404419, \"percentile_inc_nulls\": 0.526826024055481, \"value_count\": 2345, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 2345.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.46293509006500244, \"percentile_inc_nulls\": 0.5154191255569458, \"value_count\": 2192, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 2192.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.450356125831604, \"percentile_inc_nulls\": 0.504069447517395, \"value_count\": 2181, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 2181.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.43872886896133423, \"percentile_inc_nulls\": 0.49357837438583374, \"value_count\": 2016, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 2016.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 
0.4272860884666443, \"percentile_inc_nulls\": 0.4832538962364197, \"value_count\": 1984, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1984.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.4172448515892029, \"percentile_inc_nulls\": 0.4741939306259155, \"value_count\": 1741, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1741.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.4076130986213684, \"percentile_inc_nulls\": 0.46550339460372925, \"value_count\": 1670, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1670.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.3980448246002197, \"percentile_inc_nulls\": 0.4568701982498169, \"value_count\": 1659, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1659.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.38943392038345337, \"percentile_inc_nulls\": 0.4491007924079895, \"value_count\": 1493, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1493.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.38099604845046997, \"percentile_inc_nulls\": 0.44148749113082886, \"value_count\": 1463, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1463.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.372806191444397, \"percentile_inc_nulls\": 0.43409794569015503, \"value_count\": 1420, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, 
\"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1420.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.36476051807403564, \"percentile_inc_nulls\": 0.4268385171890259, \"value_count\": 1395, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1395.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.35703206062316895, \"percentile_inc_nulls\": 0.41986531019210815, \"value_count\": 1340, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1340.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.3493266701698303, \"percentile_inc_nulls\": 0.41291290521621704, \"value_count\": 1336, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1336.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.3420711159706116, \"percentile_inc_nulls\": 0.40636640787124634, \"value_count\": 1258, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1258.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.33516740798950195, \"percentile_inc_nulls\": 0.40013736486434937, \"value_count\": 1197, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1197.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.32851743698120117, \"percentile_inc_nulls\": 0.39413732290267944, \"value_count\": 1153, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1153.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.3222770094871521, 
\"percentile_inc_nulls\": 0.3885067105293274, \"value_count\": 1082, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1082.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.31607115268707275, \"percentile_inc_nulls\": 0.38290733098983765, \"value_count\": 1076, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1076.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.30991148948669434, \"percentile_inc_nulls\": 0.3773495554924011, \"value_count\": 1068, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1068.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.30378061532974243, \"percentile_inc_nulls\": 0.37181782722473145, \"value_count\": 1063, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1063.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.2978919744491577, \"percentile_inc_nulls\": 0.3665046691894531, \"value_count\": 1021, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1021.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.2920668125152588, \"percentile_inc_nulls\": 0.3612487316131592, \"value_count\": 1010, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1010.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.2864896059036255, \"percentile_inc_nulls\": 0.35621654987335205, \"value_count\": 967, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, 
\"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 967.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.2809297442436218, \"percentile_inc_nulls\": 0.35120004415512085, \"value_count\": 964, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 964.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.2754044532775879, \"percentile_inc_nulls\": 0.3462147116661072, \"value_count\": 958, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 958.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.27058857679367065, \"percentile_inc_nulls\": 0.3418694734573364, \"value_count\": 835, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 835.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.2658880352973938, \"percentile_inc_nulls\": 0.3376283049583435, \"value_count\": 815, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 815.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.26138365268707275, \"percentile_inc_nulls\": 0.33356404304504395, \"value_count\": 781, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 781.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.25697147846221924, \"percentile_inc_nulls\": 0.3295830488204956, \"value_count\": 765, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 765.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.252588152885437, \"percentile_inc_nulls\": 
0.3256281018257141, \"value_count\": 760, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 760.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.248210608959198, \"percentile_inc_nulls\": 0.3216783404350281, \"value_count\": 759, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 759.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.24416184425354004, \"percentile_inc_nulls\": 0.31802523136138916, \"value_count\": 702, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 702.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.24016493558883667, \"percentile_inc_nulls\": 0.3144189119338989, \"value_count\": 693, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 693.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.2362315058708191, \"percentile_inc_nulls\": 0.31086987257003784, \"value_count\": 682, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 682.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.23252874612808228, \"percentile_inc_nulls\": 0.3075289726257324, \"value_count\": 642, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 642.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.22886639833450317, \"percentile_inc_nulls\": 0.30422449111938477, \"value_count\": 635, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, 
\"sum_tokens_in_value_count_group\": 635.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.22521555423736572, \"percentile_inc_nulls\": 0.3009304404258728, \"value_count\": 633, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 633.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.2216973900794983, \"percentile_inc_nulls\": 0.2977560758590698, \"value_count\": 610, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 610.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.21819651126861572, \"percentile_inc_nulls\": 0.294597327709198, \"value_count\": 607, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 607.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.21485131978988647, \"percentile_inc_nulls\": 0.29157906770706177, \"value_count\": 580, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 580.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.21168500185012817, \"percentile_inc_nulls\": 0.2887221574783325, \"value_count\": 549, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 549.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.20855897665023804, \"percentile_inc_nulls\": 0.28590160608291626, \"value_count\": 542, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 542.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.20568102598190308, \"percentile_inc_nulls\": 0.2833048701286316, 
\"value_count\": 499, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 499.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.20283758640289307, \"percentile_inc_nulls\": 0.28073936700820923, \"value_count\": 493, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 493.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.20003461837768555, \"percentile_inc_nulls\": 0.27821028232574463, \"value_count\": 486, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 486.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.19467079639434814, \"percentile_inc_nulls\": 0.2733706831932068, \"value_count\": 465, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 930.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.19212734699249268, \"percentile_inc_nulls\": 0.2710757255554199, \"value_count\": 441, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 441.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.1896357536315918, \"percentile_inc_nulls\": 0.2688276767730713, \"value_count\": 432, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 432.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.18717306852340698, \"percentile_inc_nulls\": 0.2666056156158447, \"value_count\": 427, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 427.0, 
\"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.18473917245864868, \"percentile_inc_nulls\": 0.26440954208374023, \"value_count\": 422, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 422.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.18231678009033203, \"percentile_inc_nulls\": 0.2622239589691162, \"value_count\": 420, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 420.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.17991751432418823, \"percentile_inc_nulls\": 0.260059118270874, \"value_count\": 416, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 416.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.17752975225448608, \"percentile_inc_nulls\": 0.25790470838546753, \"value_count\": 414, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 414.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.1751651167869568, \"percentile_inc_nulls\": 0.25577110052108765, \"value_count\": 410, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 410.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.17281192541122437, \"percentile_inc_nulls\": 0.25364792346954346, \"value_count\": 408, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 408.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.16838252544403076, \"percentile_inc_nulls\": 0.24965131282806396, \"value_count\": 384, \"group_name\": 
\"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 768.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.16626006364822388, \"percentile_inc_nulls\": 0.24773633480072021, \"value_count\": 368, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 368.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.1642356514930725, \"percentile_inc_nulls\": 0.24590975046157837, \"value_count\": 351, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 351.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.16222858428955078, \"percentile_inc_nulls\": 0.24409878253936768, \"value_count\": 348, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 348.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.15823745727539062, \"percentile_inc_nulls\": 0.24049770832061768, \"value_count\": 346, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 692.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.15429246425628662, \"percentile_inc_nulls\": 0.2369382381439209, \"value_count\": 342, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 684.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.15233731269836426, \"percentile_inc_nulls\": 0.23517411947250366, \"value_count\": 339, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 339.0, \"distinct_value_count\": 3458}, 
{\"percentile_ex_nulls\": 0.15039938688278198, \"percentile_inc_nulls\": 0.23342561721801758, \"value_count\": 336, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 336.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.14853650331497192, \"percentile_inc_nulls\": 0.23174476623535156, \"value_count\": 323, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 323.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.14669668674468994, \"percentile_inc_nulls\": 0.23008471727371216, \"value_count\": 319, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 319.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.1448625922203064, \"percentile_inc_nulls\": 0.2284298539161682, \"value_count\": 318, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 318.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.14124059677124023, \"percentile_inc_nulls\": 0.2251618504524231, \"value_count\": 314, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 628.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.13776856660842896, \"percentile_inc_nulls\": 0.22202908992767334, \"value_count\": 301, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 602.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.13604408502578735, \"percentile_inc_nulls\": 0.22047311067581177, \"value_count\": 299, \"group_name\": \"_loc_of_incorporation_\", 
\"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 299.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.1343311071395874, \"percentile_inc_nulls\": 0.2189275622367859, \"value_count\": 297, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 297.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.1309167742729187, \"percentile_inc_nulls\": 0.21584689617156982, \"value_count\": 296, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 592.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.12926721572875977, \"percentile_inc_nulls\": 0.21435856819152832, \"value_count\": 286, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 286.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.12764078378677368, \"percentile_inc_nulls\": 0.2128911018371582, \"value_count\": 282, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 282.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.12603163719177246, \"percentile_inc_nulls\": 0.21143919229507446, \"value_count\": 279, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 279.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.1245032548904419, \"percentile_inc_nulls\": 0.21006017923355103, \"value_count\": 265, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 265.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 
0.12307292222976685, \"percentile_inc_nulls\": 0.20876961946487427, \"value_count\": 248, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 248.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.12023532390594482, \"percentile_inc_nulls\": 0.20620930194854736, \"value_count\": 246, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 492.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.11886841058731079, \"percentile_inc_nulls\": 0.20497596263885498, \"value_count\": 237, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 237.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.11756497621536255, \"percentile_inc_nulls\": 0.20379990339279175, \"value_count\": 226, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 226.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.11626726388931274, \"percentile_inc_nulls\": 0.20262902975082397, \"value_count\": 225, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 225.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.11371803283691406, \"percentile_inc_nulls\": 0.20032888650894165, \"value_count\": 221, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 442.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.11124950647354126, \"percentile_inc_nulls\": 0.19810163974761963, \"value_count\": 214, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, 
\"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 428.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.10880410671234131, \"percentile_inc_nulls\": 0.19589519500732422, \"value_count\": 212, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 424.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.10758715867996216, \"percentile_inc_nulls\": 0.19479715824127197, \"value_count\": 211, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 211.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.10640484094619751, \"percentile_inc_nulls\": 0.19373035430908203, \"value_count\": 205, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 205.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.10524553060531616, \"percentile_inc_nulls\": 0.1926843523979187, \"value_count\": 201, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 201.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.1040920615196228, \"percentile_inc_nulls\": 0.1916435956954956, \"value_count\": 200, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.10294431447982788, \"percentile_inc_nulls\": 0.19060802459716797, \"value_count\": 199, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 199.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.10180234909057617, 
\"percentile_inc_nulls\": 0.1895776391029358, \"value_count\": 198, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 198.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.10066616535186768, \"percentile_inc_nulls\": 0.18855249881744385, \"value_count\": 197, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 197.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.09953570365905762, \"percentile_inc_nulls\": 0.18753254413604736, \"value_count\": 196, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 196.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.09730947017669678, \"percentile_inc_nulls\": 0.18552380800247192, \"value_count\": 193, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 386.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.09518700838088989, \"percentile_inc_nulls\": 0.1836087703704834, \"value_count\": 184, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 368.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.09414887428283691, \"percentile_inc_nulls\": 0.18267208337783813, \"value_count\": 180, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 180.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.09315109252929688, \"percentile_inc_nulls\": 0.18177181482315063, \"value_count\": 173, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 
192164, \"sum_tokens_in_value_count_group\": 173.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.09216481447219849, \"percentile_inc_nulls\": 0.18088197708129883, \"value_count\": 171, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 171.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.09119009971618652, \"percentile_inc_nulls\": 0.18000251054763794, \"value_count\": 169, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 169.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.09025001525878906, \"percentile_inc_nulls\": 0.17915427684783936, \"value_count\": 163, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 163.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.08933299779891968, \"percentile_inc_nulls\": 0.17832684516906738, \"value_count\": 159, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 159.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.08842170238494873, \"percentile_inc_nulls\": 0.17750465869903564, \"value_count\": 158, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 158.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.08751624822616577, \"percentile_inc_nulls\": 0.1766875982284546, \"value_count\": 157, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 157.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.08663380146026611, \"percentile_inc_nulls\": 
0.17589139938354492, \"value_count\": 153, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 153.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.08580905199050903, \"percentile_inc_nulls\": 0.17514729499816895, \"value_count\": 143, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 143.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.08501309156417847, \"percentile_inc_nulls\": 0.17442911863327026, \"value_count\": 138, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.08423453569412231, \"percentile_inc_nulls\": 0.1737266182899475, \"value_count\": 135, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 135.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.08347898721694946, \"percentile_inc_nulls\": 0.17304491996765137, \"value_count\": 131, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 131.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.08272916078567505, \"percentile_inc_nulls\": 0.17236840724945068, \"value_count\": 130, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 130.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.08198517560958862, \"percentile_inc_nulls\": 0.17169708013534546, \"value_count\": 129, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, 
\"sum_tokens_in_value_count_group\": 129.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.08124691247940063, \"percentile_inc_nulls\": 0.17103099822998047, \"value_count\": 128, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 128.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.08052021265029907, \"percentile_inc_nulls\": 0.1703752875328064, \"value_count\": 126, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 126.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.07908987998962402, \"percentile_inc_nulls\": 0.16908472776412964, \"value_count\": 124, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 248.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.07838046550750732, \"percentile_inc_nulls\": 0.16844463348388672, \"value_count\": 123, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 123.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.07767683267593384, \"percentile_inc_nulls\": 0.16780978441238403, \"value_count\": 122, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 122.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.07697898149490356, \"percentile_inc_nulls\": 0.1671801209449768, \"value_count\": 121, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 121.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.07629263401031494, \"percentile_inc_nulls\": 0.1665608286857605, 
\"value_count\": 119, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 119.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.07563513517379761, \"percentile_inc_nulls\": 0.16596758365631104, \"value_count\": 114, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.07498341798782349, \"percentile_inc_nulls\": 0.1653795838356018, \"value_count\": 113, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 113.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.07434898614883423, \"percentile_inc_nulls\": 0.16480714082717896, \"value_count\": 110, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.07373183965682983, \"percentile_inc_nulls\": 0.16425031423568726, \"value_count\": 107, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 107.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.07253223657608032, \"percentile_inc_nulls\": 0.16316789388656616, \"value_count\": 104, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 208.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.07195550203323364, \"percentile_inc_nulls\": 0.162647545337677, \"value_count\": 100, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 100.0, 
\"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.0713844895362854, \"percentile_inc_nulls\": 0.16213232278823853, \"value_count\": 99, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.07081925868988037, \"percentile_inc_nulls\": 0.16162234544754028, \"value_count\": 98, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.07026559114456177, \"percentile_inc_nulls\": 0.16112279891967773, \"value_count\": 96, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 96.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06971770524978638, \"percentile_inc_nulls\": 0.16062843799591064, \"value_count\": 95, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 95.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06918132305145264, \"percentile_inc_nulls\": 0.16014444828033447, \"value_count\": 93, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 93.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06815469264984131, \"percentile_inc_nulls\": 0.1592181921005249, \"value_count\": 89, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 178.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06764715909957886, \"percentile_inc_nulls\": 0.15876024961471558, \"value_count\": 88, \"group_name\": 
\"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06714534759521484, \"percentile_inc_nulls\": 0.1583074927330017, \"value_count\": 87, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 87.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06666088104248047, \"percentile_inc_nulls\": 0.157870352268219, \"value_count\": 84, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 84.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06618797779083252, \"percentile_inc_nulls\": 0.15744364261627197, \"value_count\": 82, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 82.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06573230028152466, \"percentile_inc_nulls\": 0.1570325493812561, \"value_count\": 79, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 79.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06528246402740479, \"percentile_inc_nulls\": 0.1566266417503357, \"value_count\": 78, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06483834981918335, \"percentile_inc_nulls\": 0.15622591972351074, \"value_count\": 77, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 77.0, \"distinct_value_count\": 3458}, 
{\"percentile_ex_nulls\": 0.0644230842590332, \"percentile_inc_nulls\": 0.15585124492645264, \"value_count\": 72, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 72.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06361567974090576, \"percentile_inc_nulls\": 0.15512269735336304, \"value_count\": 70, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 140.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06284278631210327, \"percentile_inc_nulls\": 0.15442538261413574, \"value_count\": 67, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 134.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06246793270111084, \"percentile_inc_nulls\": 0.1540871262550354, \"value_count\": 65, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 65.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06175273656845093, \"percentile_inc_nulls\": 0.15344184637069702, \"value_count\": 62, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 124.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06140094995498657, \"percentile_inc_nulls\": 0.1531243920326233, \"value_count\": 61, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 61.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06036275625228882, \"percentile_inc_nulls\": 0.15218770503997803, \"value_count\": 60, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 
173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 180.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06002247333526611, \"percentile_inc_nulls\": 0.15188068151474, \"value_count\": 59, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 59.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.05868440866470337, \"percentile_inc_nulls\": 0.15067338943481445, \"value_count\": 58, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 232.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.05835568904876709, \"percentile_inc_nulls\": 0.15037673711776733, \"value_count\": 57, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 57.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.05803269147872925, \"percentile_inc_nulls\": 0.15008533000946045, \"value_count\": 56, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.05708104372024536, \"percentile_inc_nulls\": 0.14922672510147095, \"value_count\": 55, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 165.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.056769609451293945, \"percentile_inc_nulls\": 0.14894568920135498, \"value_count\": 54, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 54.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.05647546052932739, 
\"percentile_inc_nulls\": 0.14868026971817017, \"value_count\": 51, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 51.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.056192874908447266, \"percentile_inc_nulls\": 0.14842528104782104, \"value_count\": 49, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 49.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.05536234378814697, \"percentile_inc_nulls\": 0.14767593145370483, \"value_count\": 48, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.055091261863708496, \"percentile_inc_nulls\": 0.1474313735961914, \"value_count\": 47, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 47.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.05483752489089966, \"percentile_inc_nulls\": 0.14720237255096436, \"value_count\": 44, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 44.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.05409348011016846, \"percentile_inc_nulls\": 0.1465311050415039, \"value_count\": 43, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 129.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.053366780281066895, \"percentile_inc_nulls\": 0.14587539434432983, \"value_count\": 42, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, 
\"sum_tokens_in_value_count_group\": 126.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.05242091417312622, \"percentile_inc_nulls\": 0.1450219750404358, \"value_count\": 41, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 164.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.05195951461791992, \"percentile_inc_nulls\": 0.1446056365966797, \"value_count\": 40, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 80.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.05150961875915527, \"percentile_inc_nulls\": 0.14419972896575928, \"value_count\": 39, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.05107128620147705, \"percentile_inc_nulls\": 0.14380425214767456, \"value_count\": 38, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 76.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.050431132316589355, \"percentile_inc_nulls\": 0.14322662353515625, \"value_count\": 37, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 111.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.050223469734191895, \"percentile_inc_nulls\": 0.1430392861366272, \"value_count\": 36, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 36.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.04943913221359253, \"percentile_inc_nulls\": 0.1423315405845642, \"value_count\": 
34, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 136.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.049081504344940186, \"percentile_inc_nulls\": 0.14200890064239502, \"value_count\": 31, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 62.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.04873549938201904, \"percentile_inc_nulls\": 0.14169669151306152, \"value_count\": 30, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 60.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.04806643724441528, \"percentile_inc_nulls\": 0.14109301567077637, \"value_count\": 29, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 116.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.04774343967437744, \"percentile_inc_nulls\": 0.14080160856246948, \"value_count\": 28, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.04712057113647461, \"percentile_inc_nulls\": 0.14023959636688232, \"value_count\": 27, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.04667067527770996, \"percentile_inc_nulls\": 0.13983368873596191, \"value_count\": 26, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 
3458}, {\"percentile_ex_nulls\": 0.04566138982772827, \"percentile_inc_nulls\": 0.13892298936843872, \"value_count\": 25, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 175.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.04538452625274658, \"percentile_inc_nulls\": 0.13867324590682983, \"value_count\": 24, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 48.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.04445600509643555, \"percentile_inc_nulls\": 0.13783538341522217, \"value_count\": 23, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 161.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.043567776679992676, \"percentile_inc_nulls\": 0.13703399896621704, \"value_count\": 22, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 154.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.043204426765441895, \"percentile_inc_nulls\": 0.1367061734199524, \"value_count\": 21, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 63.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.04285836219787598, \"percentile_inc_nulls\": 0.13639390468597412, \"value_count\": 20, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 60.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.04187214374542236, \"percentile_inc_nulls\": 0.13550406694412231, \"value_count\": 19, \"group_name\": \"_loc_of_incorporation_\", 
\"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 171.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.04093778133392334, \"percentile_inc_nulls\": 0.1346610188484192, \"value_count\": 18, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 162.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.040153443813323975, \"percentile_inc_nulls\": 0.1339532732963562, \"value_count\": 17, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 136.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.039415180683135986, \"percentile_inc_nulls\": 0.1332871913909912, \"value_count\": 16, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 128.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.03829050064086914, \"percentile_inc_nulls\": 0.13227242231369019, \"value_count\": 15, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 195.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.03716009855270386, \"percentile_inc_nulls\": 0.1312524676322937, \"value_count\": 14, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 196.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.036185383796691895, \"percentile_inc_nulls\": 0.1303730010986328, \"value_count\": 13, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 169.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 
0.03473198413848877, \"percentile_inc_nulls\": 0.12906163930892944, \"value_count\": 12, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 252.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.03371685743331909, \"percentile_inc_nulls\": 0.1281457543373108, \"value_count\": 11, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 176.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 10, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 220.0, \"distinct_value_count\": 3458}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"loc_of_incorporation\\\"\", \"subtitle\": \"In this col, 18,779 values (9.8%) are null and there are 3458 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 63256, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"delaware\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 3458}, {\"value_count\": 5664, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"united kingdom\", \"total_non_null_rows\": 173385, 
\"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 3458}, {\"value_count\": 3132, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"texas\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 3458}, {\"value_count\": 3000, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"netherlands\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 3458}, {\"value_count\": 2830, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"germany\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 3458}, {\"value_count\": 2801, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"cayman islands\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 3458}, {\"value_count\": 2799, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"california\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 3458}, {\"value_count\": 2576, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"china\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 3458}, {\"value_count\": 2524, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"florida\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 3458}, {\"value_count\": 2345, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"australia\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 3458}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": 
\"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"jersey islanddelaware\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 3458}, {\"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"tanzania, united republic of\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 3458}, {\"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"albany\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 3458}, {\"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"calallen\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 3458}, {\"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"private uk\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 3458}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 63256]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}], \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\"}, {\"mode\": \"vega-lite\"});\n",
+       "</script>"
+      ],
+      "text/plain": [
+       "alt.VConcatChart(...)"
+      ]
+     },
+     "execution_count": 95,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "profile_columns(ex21_match_df[match_cols], db_api=DuckDBAPI(), top_n=10, bottom_n=5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 96,
+   "id": "2a57f717-140f-434d-8998-983b8bf38ac5",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "<style>\n",
+       "  #altair-viz-df3493bc8ab644cd88e67dc7251d7ca5.vega-embed {\n",
+       "    width: 100%;\n",
+       "    display: flex;\n",
+       "  }\n",
+       "\n",
+       "  #altair-viz-df3493bc8ab644cd88e67dc7251d7ca5.vega-embed details,\n",
+       "  #altair-viz-df3493bc8ab644cd88e67dc7251d7ca5.vega-embed details summary {\n",
+       "    position: relative;\n",
+       "  }\n",
+       "</style>\n",
+       "<div id=\"altair-viz-df3493bc8ab644cd88e67dc7251d7ca5\"></div>\n",
+       "<script type=\"text/javascript\">\n",
+       "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
+       "  (function(spec, embedOpt){\n",
+       "    let outputDiv = document.currentScript.previousElementSibling;\n",
+       "    if (outputDiv.id !== \"altair-viz-df3493bc8ab644cd88e67dc7251d7ca5\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-df3493bc8ab644cd88e67dc7251d7ca5\");\n",
+       "    }\n",
+       "    const paths = {\n",
+       "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
+       "      \"vega-lib\": \"https://cdn.jsdelivr.net/npm/vega-lib?noext\",\n",
+       "      \"vega-lite\": \"https://cdn.jsdelivr.net/npm/vega-lite@5.20.1?noext\",\n",
+       "      \"vega-embed\": \"https://cdn.jsdelivr.net/npm/vega-embed@6?noext\",\n",
+       "    };\n",
+       "\n",
+       "    function maybeLoadScript(lib, version) {\n",
+       "      var key = `${lib.replace(\"-\", \"\")}_version`;\n",
+       "      return (VEGA_DEBUG[key] == version) ?\n",
+       "        Promise.resolve(paths[lib]) :\n",
+       "        new Promise(function(resolve, reject) {\n",
+       "          var s = document.createElement('script');\n",
+       "          document.getElementsByTagName(\"head\")[0].appendChild(s);\n",
+       "          s.async = true;\n",
+       "          s.onload = () => {\n",
+       "            VEGA_DEBUG[key] = version;\n",
+       "            return resolve(paths[lib]);\n",
+       "          };\n",
+       "          s.onerror = () => reject(`Error loading script: ${paths[lib]}`);\n",
+       "          s.src = paths[lib];\n",
+       "        });\n",
+       "    }\n",
+       "\n",
+       "    function showError(err) {\n",
+       "      outputDiv.innerHTML = `<div class=\"error\" style=\"color:red;\">${err}</div>`;\n",
+       "      throw err;\n",
+       "    }\n",
+       "\n",
+       "    function displayChart(vegaEmbed) {\n",
+       "      vegaEmbed(outputDiv, spec, embedOpt)\n",
+       "        .catch(err => showError(`Javascript Error: ${err.message}<br>This usually means there's a typo in your chart specification. See the javascript console for the full traceback.`));\n",
+       "    }\n",
+       "\n",
+       "    if(typeof define === \"function\" && define.amd) {\n",
+       "      requirejs.config({paths});\n",
+       "      require([\"vega-embed\"], displayChart, err => showError(`Error loading script: ${err.message}`));\n",
+       "    } else {\n",
+       "      maybeLoadScript(\"vega\", \"5\")\n",
+       "        .then(() => maybeLoadScript(\"vega-lite\", \"5.20.1\"))\n",
+       "        .then(() => maybeLoadScript(\"vega-embed\", \"6\"))\n",
+       "        .catch(showError)\n",
+       "        .then(() => displayChart(vegaEmbed));\n",
+       "    }\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"vconcat\": [{\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 7107, \"group_name\": \"_report_year_\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 7107.0, \"distinct_value_count\": 1}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 7107, \"group_name\": \"_report_year_\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 7107.0, \"distinct_value_count\": 1}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"report_year\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 1 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 7107, \"group_name\": \"_report_year_\", \"value\": \"2023\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 1}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": 
\"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 1 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 7107, \"group_name\": \"_report_year_\", \"value\": \"2023\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 1}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 7107]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 1 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9983115196228027, \"percentile_inc_nulls\": 0.9983115196228027, \"value_count\": 2, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 12.0, \"distinct_value_count\": 7101}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 1, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 7095.0, \"distinct_value_count\": 7101}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 2, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 12.0, \"distinct_value_count\": 7101}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", 
\"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"company_name\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 7101 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 2, \"group_name\": \"_company_name_\", \"value\": \"bill holdings, incorporated\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 7101}, {\"value_count\": 2, \"group_name\": \"_company_name_\", \"value\": \"aclarion, incorporated\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 7101}, {\"value_count\": 2, \"group_name\": \"_company_name_\", \"value\": \"augusta gold corporation\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 7101}, {\"value_count\": 2, \"group_name\": \"_company_name_\", \"value\": \"anavex life sciences corporation\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 7101}, {\"value_count\": 2, \"group_name\": \"_company_name_\", \"value\": \"inhibikase therapeutics, incorporated\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 7101}, {\"value_count\": 2, \"group_name\": \"_company_name_\", \"value\": \"magenta therapeutics, incorporated\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 7101}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"waters corp /de/\", \"total_non_null_rows\": 7107, 
\"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 7101}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"core laboratories n v\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 7101}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"optical cable corporation\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 7101}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"blonder tongue laboratories incorporated\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 7101}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"waters corp /de/\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 7101}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"core laboratories n v\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 7101}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"optical cable corporation\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 7101}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"blonder tongue laboratories incorporated\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 7101}, 
{\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"novavax incorporated\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 7101}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 2]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.41720396280288696, \"percentile_inc_nulls\": 0.4785422682762146, \"value_count\": 3706, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 3706.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.31655919551849365, \"percentile_inc_nulls\": 0.3884902000427246, \"value_count\": 640, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 640.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.26576507091522217, \"percentile_inc_nulls\": 0.3430420756340027, \"value_count\": 323, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 323.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.23572885990142822, \"percentile_inc_nulls\": 0.3161671757698059, \"value_count\": 191, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 191.0, 
\"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.21811604499816895, \"percentile_inc_nulls\": 0.3004080653190613, \"value_count\": 112, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.20113223791122437, \"percentile_inc_nulls\": 0.2852117419242859, \"value_count\": 108, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.1857210397720337, \"percentile_inc_nulls\": 0.2714225649833679, \"value_count\": 98, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.17282593250274658, \"percentile_inc_nulls\": 0.2598845958709717, \"value_count\": 82, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 82.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.16071707010269165, \"percentile_inc_nulls\": 0.24905025959014893, \"value_count\": 77, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 77.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.14939457178115845, \"percentile_inc_nulls\": 0.23891937732696533, \"value_count\": 72, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 72.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.1390155553817749, \"percentile_inc_nulls\": 0.22963273525238037, \"value_count\": 66, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, 
\"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 66.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.12989461421966553, \"percentile_inc_nulls\": 0.22147178649902344, \"value_count\": 58, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.12218904495239258, \"percentile_inc_nulls\": 0.21457719802856445, \"value_count\": 49, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 49.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.11479794979095459, \"percentile_inc_nulls\": 0.2079640030860901, \"value_count\": 47, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 47.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.10787862539291382, \"percentile_inc_nulls\": 0.20177292823791504, \"value_count\": 44, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 44.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.10127377510070801, \"percentile_inc_nulls\": 0.1958632469177246, \"value_count\": 42, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.08869320154190063, \"percentile_inc_nulls\": 0.18460673093795776, \"value_count\": 40, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 80.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.08256012201309204, \"percentile_inc_nulls\": 0.17911916971206665, \"value_count\": 39, 
\"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 39.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.07689887285232544, \"percentile_inc_nulls\": 0.17405372858047485, \"value_count\": 36, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 36.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.07139486074447632, \"percentile_inc_nulls\": 0.16912901401519775, \"value_count\": 35, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 35.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.06683439016342163, \"percentile_inc_nulls\": 0.1650485396385193, \"value_count\": 29, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 29.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.05834251642227173, \"percentile_inc_nulls\": 0.1574503779411316, \"value_count\": 27, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 54.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.054725587368011475, \"percentile_inc_nulls\": 0.15421414375305176, \"value_count\": 23, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 23.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.05158042907714844, \"percentile_inc_nulls\": 0.15140002965927124, \"value_count\": 20, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 20.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 
0.048592567443847656, \"percentile_inc_nulls\": 0.14872658252716064, \"value_count\": 19, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 19.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.043245792388916016, \"percentile_inc_nulls\": 0.14394259452819824, \"value_count\": 17, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 34.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.036169230937957764, \"percentile_inc_nulls\": 0.1376107931137085, \"value_count\": 15, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 45.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.03176599740982056, \"percentile_inc_nulls\": 0.13367104530334473, \"value_count\": 14, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 28.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.027677297592163086, \"percentile_inc_nulls\": 0.13001269102096558, \"value_count\": 13, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 26.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.024217665195465088, \"percentile_inc_nulls\": 0.12691712379455566, \"value_count\": 11, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 22.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.02107250690460205, \"percentile_inc_nulls\": 0.12410300970077515, \"value_count\": 10, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, 
\"sum_tokens_in_value_count_group\": 20.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.0196571946144104, \"percentile_inc_nulls\": 0.1228366494178772, \"value_count\": 9, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 9.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.01839911937713623, \"percentile_inc_nulls\": 0.12171101570129395, \"value_count\": 8, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 8.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.016197502613067627, \"percentile_inc_nulls\": 0.11974108219146729, \"value_count\": 7, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 14.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.01431041955947876, \"percentile_inc_nulls\": 0.11805260181427002, \"value_count\": 6, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 12.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.011165261268615723, \"percentile_inc_nulls\": 0.1152384877204895, \"value_count\": 5, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 20.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.007391095161437988, \"percentile_inc_nulls\": 0.11186152696609497, \"value_count\": 4, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 24.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.004088699817657471, \"percentile_inc_nulls\": 0.10890668630599976, \"value_count\": 3, \"group_name\": 
\"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 21.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.0015725493431091309, \"percentile_inc_nulls\": 0.10665541887283325, \"value_count\": 2, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.1052483320236206, \"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 10.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 3706, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 3706.0, \"distinct_value_count\": 81}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"loc_of_incorporation\\\"\", \"subtitle\": \"In this col, 748 values (10.5%) are null and there are 81 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 3706, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"delaware\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, 
\"distinct_value_count\": 81}, {\"value_count\": 640, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"nevada\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 81}, {\"value_count\": 323, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"maryland\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 81}, {\"value_count\": 191, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"e9\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 81}, {\"value_count\": 112, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"florida\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 81}, {\"value_count\": 108, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"new york\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 81}, {\"value_count\": 98, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"north carolina\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 81}, {\"value_count\": 82, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"pennsylvania\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 81}, {\"value_count\": 77, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"texas\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 81}, {\"value_count\": 72, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"california\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 81}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": 
\"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"t3\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 81}, {\"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"f4\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 81}, {\"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"c5\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 81}, {\"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"p8\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 81}, {\"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"a3\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 81}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 3706]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}], \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\"}, {\"mode\": \"vega-lite\"});\n",
+       "</script>"
+      ],
+      "text/plain": [
+       "alt.VConcatChart(...)"
+      ]
+     },
+     "execution_count": 96,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "profile_columns(sec_match_df[match_cols], db_api=DuckDBAPI(), top_n=10, bottom_n=5)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1f258250-97c1-4f19-b535-cb91ff9e0ea9",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "## Blocking\n",
+    "\n",
+    "We may be able to match subsidiaries to filers without blocking, but we probably want a blocking rule.\n",
+    "\n",
+    "TODO: Can we block on the nearest 5 report years instead of requiring an exact match on report year?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 97,
+   "id": "fb6d143b-5201-4b31-849c-97db80781ade",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from splink import block_on\n",
+    "from splink.blocking_analysis import count_comparisons_from_blocking_rule, n_largest_blocks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 103,
+   "id": "22766c9f-7371-483f-82b0-015549a84357",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "br = \"l.report_year = r.report_year and substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3)\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 104,
+   "id": "60937a9c-dff6-4d68-808f-81b8228fc9f6",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'number_of_comparisons_generated_pre_filter_conditions': 2069828,\n",
+       " 'number_of_comparisons_to_be_scored_post_filter_conditions': 2069828,\n",
+       " 'filter_conditions_identified': '',\n",
+       " 'equi_join_conditions_identified': 'l.report_year = r.report_year AND SUBSTRING(l.company_name_mphone, 1, 3) = SUBSTRING(r.company_name_mphone, 1, 3)',\n",
+       " 'link_type_join_condition': 'where l.\"source_dataset\" || \\'-__-\\' || l.\"record_id\" < r.\"source_dataset\" || \\'-__-\\' || r.\"record_id\" and l.\"source_dataset\" != r.\"source_dataset\"'}"
+      ]
+     },
+     "execution_count": 104,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# br0 = block_on(\"report_year\", \"report_year\")\n",
+    "# br1 = \"jaccard(l.company_name, r.company_name) < .1\"\n",
+    "# br2 = block_on(\"company_name\", \"company_name\")\n",
+    "\n",
+    "counts = count_comparisons_from_blocking_rule(\n",
+    "    table_or_tables=[sec_match_df, ex21_match_df],\n",
+    "    blocking_rule=br,\n",
+    "    link_type=\"link_only\",\n",
+    "    unique_id_column_name='record_id',\n",
+    "    db_api=db_api,\n",
+    ")\n",
+    "\n",
+    "counts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 106,
+   "id": "67717313-2c17-4b6b-b984-8f7bc955c678",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>key_0</th>\n",
+       "      <th>key_1</th>\n",
+       "      <th>count_l</th>\n",
+       "      <th>count_r</th>\n",
+       "      <th>block_count</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>2023</td>\n",
+       "      <td>STR</td>\n",
+       "      <td>68</td>\n",
+       "      <td>1297</td>\n",
+       "      <td>88196</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2023</td>\n",
+       "      <td>INT</td>\n",
+       "      <td>62</td>\n",
+       "      <td>1275</td>\n",
+       "      <td>79050</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2023</td>\n",
+       "      <td>KRN</td>\n",
+       "      <td>60</td>\n",
+       "      <td>1290</td>\n",
+       "      <td>77400</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   key_0 key_1  count_l  count_r  block_count\n",
+       "0   2023   STR       68     1297        88196\n",
+       "1   2023   INT       62     1275        79050\n",
+       "2   2023   KRN       60     1290        77400"
+      ]
+     },
+     "execution_count": 106,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "result = n_largest_blocks(\n",
+    "    table_or_tables=[sec_match_df, ex21_match_df],\n",
+    "    blocking_rule=br,\n",
+    "    link_type=\"link_only\",\n",
+    "    db_api=db_api,\n",
+    "    n_largest=3\n",
+    ")\n",
+    "\n",
+    "result.as_pandas_dataframe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 107,
+   "id": "6fe6fb99-f5fd-4538-a8bc-c9dd41f4ff9c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "<style>\n",
+       "  #altair-viz-386d63fed4c940d1bd4e29a1fc26421c.vega-embed {\n",
+       "    width: 100%;\n",
+       "    display: flex;\n",
+       "  }\n",
+       "\n",
+       "  #altair-viz-386d63fed4c940d1bd4e29a1fc26421c.vega-embed details,\n",
+       "  #altair-viz-386d63fed4c940d1bd4e29a1fc26421c.vega-embed details summary {\n",
+       "    position: relative;\n",
+       "  }\n",
+       "</style>\n",
+       "<div id=\"altair-viz-386d63fed4c940d1bd4e29a1fc26421c\"></div>\n",
+       "<script type=\"text/javascript\">\n",
+       "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
+       "  (function(spec, embedOpt){\n",
+       "    let outputDiv = document.currentScript.previousElementSibling;\n",
+       "    if (outputDiv.id !== \"altair-viz-386d63fed4c940d1bd4e29a1fc26421c\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-386d63fed4c940d1bd4e29a1fc26421c\");\n",
+       "    }\n",
+       "    const paths = {\n",
+       "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
+       "      \"vega-lib\": \"https://cdn.jsdelivr.net/npm/vega-lib?noext\",\n",
+       "      \"vega-lite\": \"https://cdn.jsdelivr.net/npm/vega-lite@5.20.1?noext\",\n",
+       "      \"vega-embed\": \"https://cdn.jsdelivr.net/npm/vega-embed@6?noext\",\n",
+       "    };\n",
+       "\n",
+       "    function maybeLoadScript(lib, version) {\n",
+       "      var key = `${lib.replace(\"-\", \"\")}_version`;\n",
+       "      return (VEGA_DEBUG[key] == version) ?\n",
+       "        Promise.resolve(paths[lib]) :\n",
+       "        new Promise(function(resolve, reject) {\n",
+       "          var s = document.createElement('script');\n",
+       "          document.getElementsByTagName(\"head\")[0].appendChild(s);\n",
+       "          s.async = true;\n",
+       "          s.onload = () => {\n",
+       "            VEGA_DEBUG[key] = version;\n",
+       "            return resolve(paths[lib]);\n",
+       "          };\n",
+       "          s.onerror = () => reject(`Error loading script: ${paths[lib]}`);\n",
+       "          s.src = paths[lib];\n",
+       "        });\n",
+       "    }\n",
+       "\n",
+       "    function showError(err) {\n",
+       "      outputDiv.innerHTML = `<div class=\"error\" style=\"color:red;\">${err}</div>`;\n",
+       "      throw err;\n",
+       "    }\n",
+       "\n",
+       "    function displayChart(vegaEmbed) {\n",
+       "      vegaEmbed(outputDiv, spec, embedOpt)\n",
+       "        .catch(err => showError(`Javascript Error: ${err.message}<br>This usually means there's a typo in your chart specification. See the javascript console for the full traceback.`));\n",
+       "    }\n",
+       "\n",
+       "    if(typeof define === \"function\" && define.amd) {\n",
+       "      requirejs.config({paths});\n",
+       "      require([\"vega-embed\"], displayChart, err => showError(`Error loading script: ${err.message}`));\n",
+       "    } else {\n",
+       "      maybeLoadScript(\"vega\", \"5\")\n",
+       "        .then(() => maybeLoadScript(\"vega-lite\", \"5.20.1\"))\n",
+       "        .then(() => maybeLoadScript(\"vega-embed\", \"6\"))\n",
+       "        .catch(showError)\n",
+       "        .then(() => displayChart(vegaEmbed));\n",
+       "    }\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 300, \"continuousHeight\": 300}}, \"data\": {\"name\": \"data-da90c619ab0a310af714d4034b6664f8\"}, \"mark\": \"bar\", \"encoding\": {\"order\": {\"field\": \"cumulative_rows\"}, \"tooltip\": [{\"field\": \"blocking_rule\", \"title\": \"SQL Condition\", \"type\": \"nominal\"}, {\"field\": \"row_count\", \"format\": \",\", \"title\": \"Comparisons Generated\", \"type\": \"quantitative\"}, {\"field\": \"cumulative_rows\", \"format\": \",\", \"title\": \"Cumulative Comparisons\", \"type\": \"quantitative\"}, {\"field\": \"cartesian\", \"format\": \",\", \"title\": \"Total comparisons in Cartesian product\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"start\", \"title\": \"Comparisons Generated by Rule(s)\", \"type\": \"quantitative\"}, \"x2\": {\"field\": \"cumulative_rows\"}, \"y\": {\"field\": \"blocking_rule\", \"sort\": [\"-x2\"], \"title\": \"SQL Blocking Rule\"}}, \"height\": {\"step\": 20}, \"title\": {\"text\": \"Count of Additional Comparisons Generated by Each Blocking Rule\", \"subtitle\": \"(Counts exclude comparisons already generated by previous rules)\"}, \"width\": 450, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-da90c619ab0a310af714d4034b6664f8\": [{\"blocking_rule\": \"l.report_year = r.report_year and substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3)\", \"row_count\": 2069828, \"cumulative_rows\": 2069828, \"cartesian\": 1365709548, \"match_key\": \"0\", \"start\": 0}]}}, {\"mode\": \"vega-lite\"});\n",
+       "</script>"
+      ],
+      "text/plain": [
+       "alt.Chart(...)"
+      ]
+     },
+     "execution_count": 107,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from splink.blocking_analysis import (\n",
+    "    cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n",
+    ")\n",
+    "\n",
+    "blocking_rules_for_analysis = [\n",
+    "    # block_on(\"substr(l.company_name_mphone,1,3)\", \"substr(r.company_name_mphone,1,3)\"),\n",
+    "    \"l.report_year = r.report_year and substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3)\"\n",
+    "]\n",
+    "\n",
+    "\n",
+    "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n",
+    "    table_or_tables=[sec_match_df, ex21_match_df],\n",
+    "    blocking_rules=blocking_rules_for_analysis,\n",
+    "    db_api=db_api,\n",
+    "    unique_id_column_name='record_id',\n",
+    "    link_type=\"link_only\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b553f3fb-0661-46ab-b43c-f5fcba608a09",
+   "metadata": {},
+   "source": [
+    "## Create Model\n",
+    "\n",
+    "Should we deduplicate the Ex. 21 data first and then link it to the SEC filers?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 108,
+   "id": "1f12d114-22fd-4f12-a0be-6a62500e80d5",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import splink.comparison_library as cl\n",
+    "from splink import Linker, SettingsCreator"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 109,
+   "id": "bb13b160-b554-45d6-a575-5fa2de061350",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Comparison 'NameComparison' of \"company_name\".\n",
+      "Similarity is assessed using the following ComparisonLevels:\n",
+      "    - 'company_name is NULL' with SQL rule: \"company_name_l\" IS NULL OR \"company_name_r\" IS NULL\n",
+      "    - 'Exact match on company_name' with SQL rule: \"company_name_l\" = \"company_name_r\"\n",
+      "    - 'Jaro-Winkler distance of company_name >= 0.92' with SQL rule: jaro_winkler_similarity(\"company_name_l\", \"company_name_r\") >= 0.92\n",
+      "    - 'Jaro-Winkler distance of company_name >= 0.88' with SQL rule: jaro_winkler_similarity(\"company_name_l\", \"company_name_r\") >= 0.88\n",
+      "    - 'Jaro-Winkler distance of company_name >= 0.7' with SQL rule: jaro_winkler_similarity(\"company_name_l\", \"company_name_r\") >= 0.7\n",
+      "    - 'All other comparisons' with SQL rule: ELSE\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "company_name_comparison = cl.NameComparison(\n",
+    "    \"company_name\",\n",
+    "    # dmeta_col_name=\"company_name_mphone\" # this was breaking it for some reason\n",
+    ")\n",
+    "print(company_name_comparison.get_comparison(\"duckdb\").human_readable_description)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 110,
+   "id": "7d2697d3-efdb-4be4-8911-18b457f5bab4",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Comparison 'JaroWinklerAtThresholds' of \"loc_of_incorporation\".\n",
+      "Similarity is assessed using the following ComparisonLevels:\n",
+      "    - 'loc_of_incorporation is NULL' with SQL rule: \"loc_of_incorporation_l\" IS NULL OR \"loc_of_incorporation_r\" IS NULL\n",
+      "    - 'Exact match on loc_of_incorporation' with SQL rule: \"loc_of_incorporation_l\" = \"loc_of_incorporation_r\"\n",
+      "    - 'Jaro-Winkler distance of loc_of_incorporation >= 0.9' with SQL rule: jaro_winkler_similarity(\"loc_of_incorporation_l\", \"loc_of_incorporation_r\") >= 0.9\n",
+      "    - 'Jaro-Winkler distance of loc_of_incorporation >= 0.7' with SQL rule: jaro_winkler_similarity(\"loc_of_incorporation_l\", \"loc_of_incorporation_r\") >= 0.7\n",
+      "    - 'All other comparisons' with SQL rule: ELSE\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# try with Levenshtein too\n",
+    "location_comparison = cl.JaroWinklerAtThresholds(\n",
+    "    \"loc_of_incorporation\",\n",
+    ")\n",
+    "print(location_comparison.get_comparison(\"duckdb\").human_readable_description)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 111,
+   "id": "92c1ad6b-4516-4ab4-90eb-394669c4a02b",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "settings = SettingsCreator(\n",
+    "    link_type=\"link_only\",\n",
+    "    unique_id_column_name=\"record_id\",\n",
+    "    comparisons=[\n",
+    "        company_name_comparison,\n",
+    "        location_comparison.configure(term_frequency_adjustments=True)\n",
+    "    ],\n",
+    "    blocking_rules_to_generate_predictions=[\n",
+    "        br\n",
+    "    ],\n",
+    "    retain_intermediate_calculation_columns=True,\n",
+    ")\n",
+    "\n",
+    "linker = Linker([sec_match_df, ex21_match_df], settings, db_api=DuckDBAPI())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2f293657-b40c-4539-8abd-8524d11c39c0",
+   "metadata": {},
+   "source": [
+    "Estimate the probability that two random records match."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 112,
+   "id": "e9eb59b9-49cc-45b7-8ffa-b8f7e5372608",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f8061ccbd73c426daa2d35dbf68e55fb",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Probability two random records match is estimated to be  0.000689.\n",
+      "This means that amongst all possible pairwise record comparisons, one in 1,452.36 are expected to match.  With 1,365,709,548 total possible comparisons, we expect a total of around 940,336.47 matching pairs\n"
+     ]
+    }
+   ],
+   "source": [
+    "deterministic_rules = [\n",
+    "    block_on(\"company_name_mphone\", \"company_name_mphone\"),\n",
+    "    \"jaccard(r.company_name, l.company_name) >= .9 and l.loc_of_incorporation = r.loc_of_incorporation\",\n",
+    "    \"substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3) and jaccard(r.company_name, l.company_name) >= .8\",\n",
+    "    # \"substr(l.company_name_mphone,1,5) = substr(r.company_name_mphone,1,5) and l.loc_of_incorporation = r.loc_of_incorporation\"\n",
+    "]\n",
+    "\n",
+    "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.85)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 113,
+   "id": "5117653e-e72b-4c13-b923-d1228b39d357",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "----- Estimating u probabilities using random sampling -----\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e732ac0702e4459b82b86d2de5c9d9fc",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Estimated u probabilities using random sampling\n",
+      "\n",
+      "Your model is not yet fully trained. Missing estimates for:\n",
+      "    - company_name (no m values are trained).\n",
+      "    - loc_of_incorporation (no m values are trained).\n"
+     ]
+    }
+   ],
+   "source": [
+    "linker.training.estimate_u_using_random_sampling(max_pairs=1e7)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 114,
+   "id": "8b089a0d-4c91-4b4d-9806-ed83c9bd59b9",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "----- Starting EM training session -----\n",
+      "\n",
+      "Estimating the m probabilities of the model by blocking on:\n",
+      "(l.\"company_name_mphone\" = r.\"company_name_mphone\") AND (l.\"company_name_mphone\" = r.\"company_name_mphone\")\n",
+      "\n",
+      "Parameter estimates will be made for the following comparison(s):\n",
+      "    - company_name\n",
+      "    - loc_of_incorporation\n",
+      "\n",
+      "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
+      "\n",
+      "Iteration 1: Largest change in params was -0.213 in the m_probability of loc_of_incorporation, level `Exact match on loc_of_incorporation`\n",
+      "Iteration 2: Largest change in params was 0.243 in the m_probability of loc_of_incorporation, level `All other comparisons`\n",
+      "Iteration 3: Largest change in params was 0.0314 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.88`\n",
+      "Iteration 4: Largest change in params was 0.0052 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
+      "Iteration 5: Largest change in params was 0.0087 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
+      "Iteration 6: Largest change in params was 0.0133 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
+      "Iteration 7: Largest change in params was 0.0188 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
+      "Iteration 8: Largest change in params was 0.0246 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
+      "Iteration 9: Largest change in params was 0.0297 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
+      "Iteration 10: Largest change in params was 0.0332 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
+      "Iteration 11: Largest change in params was 0.0346 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
+      "Iteration 12: Largest change in params was 0.0336 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
+      "Iteration 13: Largest change in params was 0.0306 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
+      "Iteration 14: Largest change in params was 0.0264 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
+      "Iteration 15: Largest change in params was 0.0218 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
+      "Iteration 16: Largest change in params was 0.0173 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
+      "Iteration 17: Largest change in params was 0.0134 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
+      "Iteration 18: Largest change in params was 0.0102 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
+      "Iteration 19: Largest change in params was 0.00758 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
+      "Iteration 20: Largest change in params was 0.00559 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
+      "Iteration 21: Largest change in params was 0.00409 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
+      "Iteration 22: Largest change in params was 0.00298 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
+      "Iteration 23: Largest change in params was 0.00216 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
+      "Iteration 24: Largest change in params was 0.00156 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
+      "Iteration 25: Largest change in params was 0.00112 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
+      "\n",
+      "EM converged after 25 iterations\n",
+      "\n",
+      "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n"
+     ]
+    }
+   ],
+   "source": [
+    "training_blocking_rule = block_on(\"company_name_mphone\", \"company_name_mphone\")\n",
+    "training_session_fname_sname = (\n",
+    "    linker.training.estimate_parameters_using_expectation_maximisation(training_blocking_rule)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 115,
+   "id": "88e058bc-800d-4da4-92aa-6ddb7377b4bf",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "<style>\n",
+       "  #altair-viz-3201e7b556c247e9865c46b1acc2ded5.vega-embed {\n",
+       "    width: 100%;\n",
+       "    display: flex;\n",
+       "  }\n",
+       "\n",
+       "  #altair-viz-3201e7b556c247e9865c46b1acc2ded5.vega-embed details,\n",
+       "  #altair-viz-3201e7b556c247e9865c46b1acc2ded5.vega-embed details summary {\n",
+       "    position: relative;\n",
+       "  }\n",
+       "</style>\n",
+       "<div id=\"altair-viz-3201e7b556c247e9865c46b1acc2ded5\"></div>\n",
+       "<script type=\"text/javascript\">\n",
+       "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
+       "  (function(spec, embedOpt){\n",
+       "    let outputDiv = document.currentScript.previousElementSibling;\n",
+       "    if (outputDiv.id !== \"altair-viz-3201e7b556c247e9865c46b1acc2ded5\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-3201e7b556c247e9865c46b1acc2ded5\");\n",
+       "    }\n",
+       "    const paths = {\n",
+       "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
+       "      \"vega-lib\": \"https://cdn.jsdelivr.net/npm/vega-lib?noext\",\n",
+       "      \"vega-lite\": \"https://cdn.jsdelivr.net/npm/vega-lite@5.20.1?noext\",\n",
+       "      \"vega-embed\": \"https://cdn.jsdelivr.net/npm/vega-embed@6?noext\",\n",
+       "    };\n",
+       "\n",
+       "    function maybeLoadScript(lib, version) {\n",
+       "      var key = `${lib.replace(\"-\", \"\")}_version`;\n",
+       "      return (VEGA_DEBUG[key] == version) ?\n",
+       "        Promise.resolve(paths[lib]) :\n",
+       "        new Promise(function(resolve, reject) {\n",
+       "          var s = document.createElement('script');\n",
+       "          document.getElementsByTagName(\"head\")[0].appendChild(s);\n",
+       "          s.async = true;\n",
+       "          s.onload = () => {\n",
+       "            VEGA_DEBUG[key] = version;\n",
+       "            return resolve(paths[lib]);\n",
+       "          };\n",
+       "          s.onerror = () => reject(`Error loading script: ${paths[lib]}`);\n",
+       "          s.src = paths[lib];\n",
+       "        });\n",
+       "    }\n",
+       "\n",
+       "    function showError(err) {\n",
+       "      outputDiv.innerHTML = `<div class=\"error\" style=\"color:red;\">${err}</div>`;\n",
+       "      throw err;\n",
+       "    }\n",
+       "\n",
+       "    function displayChart(vegaEmbed) {\n",
+       "      vegaEmbed(outputDiv, spec, embedOpt)\n",
+       "        .catch(err => showError(`Javascript Error: ${err.message}<br>This usually means there's a typo in your chart specification. See the javascript console for the full traceback.`));\n",
+       "    }\n",
+       "\n",
+       "    if(typeof define === \"function\" && define.amd) {\n",
+       "      requirejs.config({paths});\n",
+       "      require([\"vega-embed\"], displayChart, err => showError(`Error loading script: ${err.message}`));\n",
+       "    } else {\n",
+       "      maybeLoadScript(\"vega\", \"5\")\n",
+       "        .then(() => maybeLoadScript(\"vega-lite\", \"5.20.1\"))\n",
+       "        .then(() => maybeLoadScript(\"vega-embed\", \"6\"))\n",
+       "        .catch(showError)\n",
+       "        .then(() => displayChart(vegaEmbed));\n",
+       "    }\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 300, \"continuousHeight\": 300, \"discreteHeight\": 60, \"discreteWidth\": 400}, \"header\": {\"title\": null}, \"mark\": {\"tooltip\": null}, \"title\": {\"anchor\": \"middle\"}}, \"vconcat\": [{\"mark\": {\"type\": \"bar\", \"clip\": true, \"height\": 15}, \"encoding\": {\"color\": {\"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-10, 0, 10], \"interpolate\": \"lab\", \"range\": [\"red\", \"#bbbbbb\", \"green\"]}, \"title\": \"Match weight\", \"type\": \"quantitative\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"probability_two_random_records_match\", \"format\": \".4f\", \"title\": \"Probability two random records match\", \"type\": \"nominal\"}, {\"field\": \"log2_bayes_factor\", \"format\": \",.4f\", \"title\": \"Equivalent match weight\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"domain\": false, \"gridColor\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": \"#aaa\"}, \"value\": \"#ddd\"}, \"gridDash\": {\"condition\": {\"test\": \"abs(datum.value / 10) == 1\", \"value\": [3]}, \"value\": null}, \"gridWidth\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": 2}, \"value\": 1}, \"labels\": false, \"ticks\": false, \"title\": \"\"}, \"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-20, 20]}, \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": \"Prior (starting) match weight\", \"titleAlign\": \"right\", \"titleAngle\": 0, \"titleFontWeight\": \"normal\"}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": 20, \"transform\": [{\"filter\": \"(datum.comparison_name == 
'probability_two_random_records_match')\"}]}, {\"mark\": {\"type\": \"bar\", \"clip\": true}, \"encoding\": {\"color\": {\"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-10, 0, 10], \"interpolate\": \"lab\", \"range\": [\"red\", \"#bbbbbb\", \"green\"]}, \"title\": \"Match weight\", \"type\": \"quantitative\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labelAlign\": \"left\", \"labelAnchor\": \"middle\", \"labelAngle\": 0}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"gridColor\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": \"#aaa\"}, \"value\": \"#ddd\"}, \"gridDash\": {\"condition\": {\"test\": \"abs(datum.value / 10) == 1\", \"value\": [3]}, \"value\": null}, \"gridWidth\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": 2}, \"value\": 1}, 
\"title\": \"Comparison level match weight = log2(m/u)\"}, \"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-20, 20]}, \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"axis\": {\"y\": \"independent\"}, \"scale\": {\"y\": \"independent\"}}, \"transform\": [{\"filter\": \"(datum.comparison_name != 'probability_two_random_records_match')\"}]}], \"data\": {\"name\": \"data-fc45ce83a28220af2a936ce680a9dadd\"}, \"params\": [{\"name\": \"mouse_zoom\", \"select\": {\"type\": \"interval\", \"encodings\": [\"x\"]}, \"bind\": \"scales\", \"views\": []}], \"resolve\": {\"axis\": {\"y\": \"independent\"}, \"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Model parameters (components of final match weight)\", \"subtitle\": \"Use mousewheel to zoom\"}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-fc45ce83a28220af2a936ce680a9dadd\": [{\"comparison_name\": \"probability_two_random_records_match\", \"sql_condition\": null, \"label_for_charts\": \"\", \"m_probability\": null, \"u_probability\": null, \"m_probability_description\": null, \"u_probability_description\": null, \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": null, \"is_null_level\": false, \"bayes_factor\": 0.0006890076817709412, \"log2_bayes_factor\": -10.503192311872333, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 0, \"bayes_factor_description\": \"The probability that two random records drawn at random match is 0.001 or one in  1,452.4 records.This is equivalent to a starting match weight of -10.503.\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": -1}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"\\\"company_name_l\\\" = 
\\\"company_name_r\\\"\", \"label_for_charts\": \"Exact match on company_name\", \"m_probability\": 0.3720141993657098, \"u_probability\": 5.346727241521828e-07, \"m_probability_description\": \"Amongst matching record comparisons, 37.2% of records (i.e. one in 2.688) are in the exact match on company_name comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 5.347e-05% of records (i.e. one in 1,870,303) are in the exact match on company_name comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"company_name\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 695779.2731162851, \"log2_bayes_factor\": 19.40827017693912, \"comparison_vector_value\": 4, \"max_comparison_vector_value\": 4, \"bayes_factor_description\": \"If comparison level is `exact match on company_name` then comparison is 695,779 times more likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"jaro_winkler_similarity(\\\"company_name_l\\\", \\\"company_name_r\\\") >= 0.92\", \"label_for_charts\": \"Jaro-Winkler distance of company_name >= 0.92\", \"m_probability\": 0.19070155304725356, \"u_probability\": 5.4030085809062685e-06, \"m_probability_description\": \"Amongst matching record comparisons, 19.07% of records (i.e. one in 5.244) are in the jaro-winkler distance of company_name >= 0.92 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.0005403% of records (i.e. 
one in 185,082) are in the jaro-winkler distance of company_name >= 0.92 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 35295.437753176106, \"log2_bayes_factor\": 15.107194094030717, \"comparison_vector_value\": 3, \"max_comparison_vector_value\": 4, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of company_name >= 0.92` then comparison is 35,295 times more likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"jaro_winkler_similarity(\\\"company_name_l\\\", \\\"company_name_r\\\") >= 0.88\", \"label_for_charts\": \"Jaro-Winkler distance of company_name >= 0.88\", \"m_probability\": 0.08481404998298808, \"u_probability\": 3.987532895387595e-05, \"m_probability_description\": \"Amongst matching record comparisons, 8.481% of records (i.e. one in 11.79) are in the jaro-winkler distance of company_name >= 0.88 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.003988% of records (i.e. 
one in 25,078) are in the jaro-winkler distance of company_name >= 0.88 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 2126.9805718993075, \"log2_bayes_factor\": 11.054591140267897, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 4, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of company_name >= 0.88` then comparison is 2,127 times more likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"jaro_winkler_similarity(\\\"company_name_l\\\", \\\"company_name_r\\\") >= 0.7\", \"label_for_charts\": \"Jaro-Winkler distance of company_name >= 0.7\", \"m_probability\": 0.352353073149661, \"u_probability\": 0.028111065707703935, \"m_probability_description\": \"Amongst matching record comparisons, 35.24% of records (i.e. one in 2.838) are in the jaro-winkler distance of company_name >= 0.7 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 2.811% of records (i.e. 
one in 35.57) are in the jaro-winkler distance of company_name >= 0.7 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 12.534319289542177, \"log2_bayes_factor\": 3.6478117436904105, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 4, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of company_name >= 0.7` then comparison is 12.53 times more likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.00011712445438753835, \"u_probability\": 0.9718431212820371, \"m_probability_description\": \"Amongst matching record comparisons, 0.01171% of records (i.e. one in 8,538) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 97.18% of records (i.e. 
one in 1.029) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.0001205178611883675, \"log2_bayes_factor\": -13.01846540456049, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 4, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 8,298 times less likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 0}, {\"comparison_name\": \"loc_of_incorporation\", \"sql_condition\": \"\\\"loc_of_incorporation_l\\\" = \\\"loc_of_incorporation_r\\\"\", \"label_for_charts\": \"Exact match on loc_of_incorporation\", \"m_probability\": 0.47771795453863003, \"u_probability\": 0.20575506730342918, \"m_probability_description\": \"Amongst matching record comparisons, 47.77% of records (i.e. one in 2.093) are in the exact match on loc_of_incorporation comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 20.58% of records (i.e. 
one in 4.86) are in the exact match on loc_of_incorporation comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"loc_of_incorporation\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 2.3217797782551535, \"log2_bayes_factor\": 1.2152311384884475, \"comparison_vector_value\": 3, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `exact match on loc_of_incorporation` then comparison is 2.322 times more likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 1}, {\"comparison_name\": \"loc_of_incorporation\", \"sql_condition\": \"jaro_winkler_similarity(\\\"loc_of_incorporation_l\\\", \\\"loc_of_incorporation_r\\\") >= 0.9\", \"label_for_charts\": \"Jaro-Winkler distance of loc_of_incorporation >= 0.9\", \"m_probability\": 0.027967056159476814, \"u_probability\": 0.006199367500196429, \"m_probability_description\": \"Amongst matching record comparisons, 2.797% of records (i.e. one in 35.76) are in the jaro-winkler distance of loc_of_incorporation >= 0.9 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.6199% of records (i.e. 
one in 161) are in the jaro-winkler distance of loc_of_incorporation >= 0.9 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 4.511275732337318, \"log2_bayes_factor\": 2.1735354672850598, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of loc_of_incorporation >= 0.9` then comparison is 4.511 times more likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 1}, {\"comparison_name\": \"loc_of_incorporation\", \"sql_condition\": \"jaro_winkler_similarity(\\\"loc_of_incorporation_l\\\", \\\"loc_of_incorporation_r\\\") >= 0.7\", \"label_for_charts\": \"Jaro-Winkler distance of loc_of_incorporation >= 0.7\", \"m_probability\": 0.014133426902826658, \"u_probability\": 0.0063969929142549085, \"m_probability_description\": \"Amongst matching record comparisons, 1.413% of records (i.e. one in 70.75) are in the jaro-winkler distance of loc_of_incorporation >= 0.7 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.6397% of records (i.e. 
one in 156) are in the jaro-winkler distance of loc_of_incorporation >= 0.7 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 2.209386049393937, \"log2_bayes_factor\": 1.1436455250240343, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of loc_of_incorporation >= 0.7` then comparison is 2.209 times more likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 1}, {\"comparison_name\": \"loc_of_incorporation\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.4801815623990666, \"u_probability\": 0.7816485722821195, \"m_probability_description\": \"Amongst matching record comparisons, 48.02% of records (i.e. one in 2.083) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 78.16% of records (i.e. one in 1.279) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.614318991202296, \"log2_bayes_factor\": -0.7029401110811061, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.628 times less likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 1}]}}, {\"mode\": \"vega-lite\"});\n",
+       "</script>"
+      ],
+      "text/plain": [
+       "alt.VConcatChart(...)"
+      ]
+     },
+     "execution_count": 115,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "linker.visualisations.match_weights_chart()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 116,
+   "id": "673a4776-1de1-46ce-a411-f7fd1668d54f",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "<style>\n",
+       "  #altair-viz-7861bebb26e3480992f35df62a1446ee.vega-embed {\n",
+       "    width: 100%;\n",
+       "    display: flex;\n",
+       "  }\n",
+       "\n",
+       "  #altair-viz-7861bebb26e3480992f35df62a1446ee.vega-embed details,\n",
+       "  #altair-viz-7861bebb26e3480992f35df62a1446ee.vega-embed details summary {\n",
+       "    position: relative;\n",
+       "  }\n",
+       "</style>\n",
+       "<div id=\"altair-viz-7861bebb26e3480992f35df62a1446ee\"></div>\n",
+       "<script type=\"text/javascript\">\n",
+       "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
+       "  (function(spec, embedOpt){\n",
+       "    let outputDiv = document.currentScript.previousElementSibling;\n",
+       "    if (outputDiv.id !== \"altair-viz-7861bebb26e3480992f35df62a1446ee\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-7861bebb26e3480992f35df62a1446ee\");\n",
+       "    }\n",
+       "    const paths = {\n",
+       "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
+       "      \"vega-lib\": \"https://cdn.jsdelivr.net/npm/vega-lib?noext\",\n",
+       "      \"vega-lite\": \"https://cdn.jsdelivr.net/npm/vega-lite@5.20.1?noext\",\n",
+       "      \"vega-embed\": \"https://cdn.jsdelivr.net/npm/vega-embed@6?noext\",\n",
+       "    };\n",
+       "\n",
+       "    function maybeLoadScript(lib, version) {\n",
+       "      var key = `${lib.replace(\"-\", \"\")}_version`;\n",
+       "      return (VEGA_DEBUG[key] == version) ?\n",
+       "        Promise.resolve(paths[lib]) :\n",
+       "        new Promise(function(resolve, reject) {\n",
+       "          var s = document.createElement('script');\n",
+       "          document.getElementsByTagName(\"head\")[0].appendChild(s);\n",
+       "          s.async = true;\n",
+       "          s.onload = () => {\n",
+       "            VEGA_DEBUG[key] = version;\n",
+       "            return resolve(paths[lib]);\n",
+       "          };\n",
+       "          s.onerror = () => reject(`Error loading script: ${paths[lib]}`);\n",
+       "          s.src = paths[lib];\n",
+       "        });\n",
+       "    }\n",
+       "\n",
+       "    function showError(err) {\n",
+       "      outputDiv.innerHTML = `<div class=\"error\" style=\"color:red;\">${err}</div>`;\n",
+       "      throw err;\n",
+       "    }\n",
+       "\n",
+       "    function displayChart(vegaEmbed) {\n",
+       "      vegaEmbed(outputDiv, spec, embedOpt)\n",
+       "        .catch(err => showError(`Javascript Error: ${err.message}<br>This usually means there's a typo in your chart specification. See the javascript console for the full traceback.`));\n",
+       "    }\n",
+       "\n",
+       "    if(typeof define === \"function\" && define.amd) {\n",
+       "      requirejs.config({paths});\n",
+       "      require([\"vega-embed\"], displayChart, err => showError(`Error loading script: ${err.message}`));\n",
+       "    } else {\n",
+       "      maybeLoadScript(\"vega\", \"5\")\n",
+       "        .then(() => maybeLoadScript(\"vega-lite\", \"5.20.1\"))\n",
+       "        .then(() => maybeLoadScript(\"vega-embed\", \"6\"))\n",
+       "        .catch(showError)\n",
+       "        .then(() => displayChart(vegaEmbed));\n",
+       "    }\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 300, \"continuousHeight\": 300, \"discreteHeight\": 300, \"discreteWidth\": 400}, \"header\": {\"title\": null}, \"title\": {\"anchor\": \"middle\", \"offset\": 10}}, \"hconcat\": [{\"mark\": \"bar\", \"encoding\": {\"color\": {\"value\": \"green\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labelAlign\": \"left\", \"labelAnchor\": \"middle\", \"labelAngle\": 0}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"title\": \"Proportion of record comparisons\"}, \"field\": \"m_probability\", \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Amongst 
matching record comparisons:\", \"fontSize\": 12, \"fontWeight\": \"bold\"}, \"transform\": [{\"filter\": \"(datum.bayes_factor != 'no-op filter due to vega lite issue 4680')\"}], \"width\": 150}, {\"mark\": \"bar\", \"encoding\": {\"color\": {\"value\": \"red\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labels\": false}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"title\": \"Proportion of record comparisons\"}, \"field\": \"u_probability\", \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Amongst non-matching record comparisons:\", \"fontSize\": 12, \"fontWeight\": \"bold\"}, \"transform\": 
[{\"filter\": \"(datum.bayes_factor != 'no-op filter2 due to vega lite issue 4680')\"}], \"width\": 150}], \"data\": {\"name\": \"data-b8247f1f2757a60a3093f064d0fd8cf0\"}, \"title\": {\"text\": \"Proportion of record comparisons in each comparison level by match status\", \"subtitle\": \"(m and u probabilities)\"}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-b8247f1f2757a60a3093f064d0fd8cf0\": [{\"comparison_name\": \"company_name\", \"sql_condition\": \"\\\"company_name_l\\\" = \\\"company_name_r\\\"\", \"label_for_charts\": \"Exact match on company_name\", \"m_probability\": 0.3720141993657098, \"u_probability\": 5.346727241521828e-07, \"m_probability_description\": \"Amongst matching record comparisons, 37.2% of records (i.e. one in 2.688) are in the exact match on company_name comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 5.347e-05% of records (i.e. one in 1,870,303) are in the exact match on company_name comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"company_name\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 695779.2731162851, \"log2_bayes_factor\": 19.40827017693912, \"comparison_vector_value\": 4, \"max_comparison_vector_value\": 4, \"bayes_factor_description\": \"If comparison level is `exact match on company_name` then comparison is 695,779 times more likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"jaro_winkler_similarity(\\\"company_name_l\\\", \\\"company_name_r\\\") >= 0.92\", \"label_for_charts\": \"Jaro-Winkler distance of company_name >= 0.92\", \"m_probability\": 0.19070155304725356, \"u_probability\": 5.4030085809062685e-06, \"m_probability_description\": \"Amongst matching record comparisons, 19.07% of records (i.e. 
one in 5.244) are in the jaro-winkler distance of company_name >= 0.92 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.0005403% of records (i.e. one in 185,082) are in the jaro-winkler distance of company_name >= 0.92 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 35295.437753176106, \"log2_bayes_factor\": 15.107194094030717, \"comparison_vector_value\": 3, \"max_comparison_vector_value\": 4, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of company_name >= 0.92` then comparison is 35,295 times more likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"jaro_winkler_similarity(\\\"company_name_l\\\", \\\"company_name_r\\\") >= 0.88\", \"label_for_charts\": \"Jaro-Winkler distance of company_name >= 0.88\", \"m_probability\": 0.08481404998298808, \"u_probability\": 3.987532895387595e-05, \"m_probability_description\": \"Amongst matching record comparisons, 8.481% of records (i.e. one in 11.79) are in the jaro-winkler distance of company_name >= 0.88 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.003988% of records (i.e. 
one in 25,078) are in the jaro-winkler distance of company_name >= 0.88 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 2126.9805718993075, \"log2_bayes_factor\": 11.054591140267897, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 4, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of company_name >= 0.88` then comparison is 2,127 times more likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"jaro_winkler_similarity(\\\"company_name_l\\\", \\\"company_name_r\\\") >= 0.7\", \"label_for_charts\": \"Jaro-Winkler distance of company_name >= 0.7\", \"m_probability\": 0.352353073149661, \"u_probability\": 0.028111065707703935, \"m_probability_description\": \"Amongst matching record comparisons, 35.24% of records (i.e. one in 2.838) are in the jaro-winkler distance of company_name >= 0.7 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 2.811% of records (i.e. 
one in 35.57) are in the jaro-winkler distance of company_name >= 0.7 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 12.534319289542177, \"log2_bayes_factor\": 3.6478117436904105, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 4, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of company_name >= 0.7` then comparison is 12.53 times more likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.00011712445438753835, \"u_probability\": 0.9718431212820371, \"m_probability_description\": \"Amongst matching record comparisons, 0.01171% of records (i.e. one in 8,538) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 97.18% of records (i.e. 
one in 1.029) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.0001205178611883675, \"log2_bayes_factor\": -13.01846540456049, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 4, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 8,298 times less likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 0}, {\"comparison_name\": \"loc_of_incorporation\", \"sql_condition\": \"\\\"loc_of_incorporation_l\\\" = \\\"loc_of_incorporation_r\\\"\", \"label_for_charts\": \"Exact match on loc_of_incorporation\", \"m_probability\": 0.47771795453863003, \"u_probability\": 0.20575506730342918, \"m_probability_description\": \"Amongst matching record comparisons, 47.77% of records (i.e. one in 2.093) are in the exact match on loc_of_incorporation comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 20.58% of records (i.e. 
one in 4.86) are in the exact match on loc_of_incorporation comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"loc_of_incorporation\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 2.3217797782551535, \"log2_bayes_factor\": 1.2152311384884475, \"comparison_vector_value\": 3, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `exact match on loc_of_incorporation` then comparison is 2.322 times more likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 1}, {\"comparison_name\": \"loc_of_incorporation\", \"sql_condition\": \"jaro_winkler_similarity(\\\"loc_of_incorporation_l\\\", \\\"loc_of_incorporation_r\\\") >= 0.9\", \"label_for_charts\": \"Jaro-Winkler distance of loc_of_incorporation >= 0.9\", \"m_probability\": 0.027967056159476814, \"u_probability\": 0.006199367500196429, \"m_probability_description\": \"Amongst matching record comparisons, 2.797% of records (i.e. one in 35.76) are in the jaro-winkler distance of loc_of_incorporation >= 0.9 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.6199% of records (i.e. 
one in 161) are in the jaro-winkler distance of loc_of_incorporation >= 0.9 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 4.511275732337318, \"log2_bayes_factor\": 2.1735354672850598, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of loc_of_incorporation >= 0.9` then comparison is 4.511 times more likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 1}, {\"comparison_name\": \"loc_of_incorporation\", \"sql_condition\": \"jaro_winkler_similarity(\\\"loc_of_incorporation_l\\\", \\\"loc_of_incorporation_r\\\") >= 0.7\", \"label_for_charts\": \"Jaro-Winkler distance of loc_of_incorporation >= 0.7\", \"m_probability\": 0.014133426902826658, \"u_probability\": 0.0063969929142549085, \"m_probability_description\": \"Amongst matching record comparisons, 1.413% of records (i.e. one in 70.75) are in the jaro-winkler distance of loc_of_incorporation >= 0.7 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.6397% of records (i.e. 
one in 156) are in the jaro-winkler distance of loc_of_incorporation >= 0.7 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 2.209386049393937, \"log2_bayes_factor\": 1.1436455250240343, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of loc_of_incorporation >= 0.7` then comparison is 2.209 times more likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 1}, {\"comparison_name\": \"loc_of_incorporation\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.4801815623990666, \"u_probability\": 0.7816485722821195, \"m_probability_description\": \"Amongst matching record comparisons, 48.02% of records (i.e. one in 2.083) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 78.16% of records (i.e. one in 1.279) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.614318991202296, \"log2_bayes_factor\": -0.7029401110811061, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.628 times less likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 1}]}}, {\"mode\": \"vega-lite\"});\n",
+       "</script>"
+      ],
+      "text/plain": [
+       "alt.HConcatChart(...)"
+      ]
+     },
+     "execution_count": 116,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "linker.visualisations.m_u_parameters_chart()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ebf9e326-38f1-4d78-b302-15867cda1009",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "settings = linker.misc.save_model_to_json(\n",
+    "    \"../sec_ex21_model_settings/2023_model.json\", overwrite=True\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a14055d2-6761-4906-8555-35c92553a0e9",
+   "metadata": {},
+   "source": [
+    "Log model in MLflow."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dfe4feca-e694-4ec6-a5b0-11382c740516",
+   "metadata": {},
+   "source": [
+    "## Make predictions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 120,
+   "id": "72ff6575-68e3-4256-8253-85eb2564501f",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Blocking time: 0.37 seconds\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d550d84b328c4d3082bd7cf5d03b803b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Predict time: 78.84 seconds\n"
+     ]
+    }
+   ],
+   "source": [
+    "df_predictions = linker.inference.predict(threshold_match_probability=0.5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 121,
+   "id": "24e14675-11cf-4c46-a592-7733326113d2",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "preds_df = df_predictions.as_pandas_dataframe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 123,
+   "id": "3d733c2a-7004-4ce8-8d3f-25ed1e720c36",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>match_weight</th>\n",
+       "      <th>match_probability</th>\n",
+       "      <th>source_dataset_l</th>\n",
+       "      <th>source_dataset_r</th>\n",
+       "      <th>record_id_l</th>\n",
+       "      <th>record_id_r</th>\n",
+       "      <th>company_name_l</th>\n",
+       "      <th>company_name_r</th>\n",
+       "      <th>gamma_company_name</th>\n",
+       "      <th>tf_company_name_l</th>\n",
+       "      <th>tf_company_name_r</th>\n",
+       "      <th>bf_company_name</th>\n",
+       "      <th>bf_tf_adj_company_name</th>\n",
+       "      <th>loc_of_incorporation_l</th>\n",
+       "      <th>loc_of_incorporation_r</th>\n",
+       "      <th>gamma_loc_of_incorporation</th>\n",
+       "      <th>tf_loc_of_incorporation_l</th>\n",
+       "      <th>tf_loc_of_incorporation_r</th>\n",
+       "      <th>bf_loc_of_incorporation</th>\n",
+       "      <th>bf_tf_adj_loc_of_incorporation</th>\n",
+       "      <th>report_year_l</th>\n",
+       "      <th>report_year_r</th>\n",
+       "      <th>company_name_mphone_l</th>\n",
+       "      <th>company_name_mphone_r</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>5374</th>\n",
+       "      <td>0.008914</td>\n",
+       "      <td>0.501545</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>6916</td>\n",
+       "      <td>7681</td>\n",
+       "      <td>manitowoc co incorporated</td>\n",
+       "      <td>manitowoc crane companies, llc mcg</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.000005</td>\n",
+       "      <td>0.000005</td>\n",
+       "      <td>12.534319</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>wisconsin</td>\n",
+       "      <td>wisconsin</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0.004100</td>\n",
+       "      <td>0.004100</td>\n",
+       "      <td>2.32178</td>\n",
+       "      <td>50.180785</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>MNTWK K INKRPRTT</td>\n",
+       "      <td>MNTWK KRN KMPNS LK MKK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1452</th>\n",
+       "      <td>0.008914</td>\n",
+       "      <td>0.501545</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>3995</td>\n",
+       "      <td>1003</td>\n",
+       "      <td>schneider national, incorporated</td>\n",
+       "      <td>33.schneider logistics, incorporated</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.000005</td>\n",
+       "      <td>0.000005</td>\n",
+       "      <td>12.534319</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>wisconsin</td>\n",
+       "      <td>wisconsin</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0.004100</td>\n",
+       "      <td>0.004100</td>\n",
+       "      <td>2.32178</td>\n",
+       "      <td>50.180785</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>SXNTR NXNL INKRPRTT</td>\n",
+       "      <td>SXNTR LJSTKS INKRPRTT</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4185</th>\n",
+       "      <td>0.008914</td>\n",
+       "      <td>0.501545</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>485</td>\n",
+       "      <td>6819</td>\n",
+       "      <td>wisconsin electric power company</td>\n",
+       "      <td>wisconsin energy capital corporation</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.000010</td>\n",
+       "      <td>0.000005</td>\n",
+       "      <td>12.534319</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>wisconsin</td>\n",
+       "      <td>wisconsin</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0.004100</td>\n",
+       "      <td>0.004100</td>\n",
+       "      <td>2.32178</td>\n",
+       "      <td>50.180785</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>WSKNSN ELKTRK PWR KMPN</td>\n",
+       "      <td>WSKNSN ENRJ KPTL KRPRXN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3907</th>\n",
+       "      <td>0.008914</td>\n",
+       "      <td>0.501545</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>1836</td>\n",
+       "      <td>1390</td>\n",
+       "      <td>orion energy systems, incorporated</td>\n",
+       "      <td>wilson funeral home, incorporated</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.000005</td>\n",
+       "      <td>0.000005</td>\n",
+       "      <td>12.534319</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>wisconsin</td>\n",
+       "      <td>wisconsin</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0.004100</td>\n",
+       "      <td>0.004100</td>\n",
+       "      <td>2.32178</td>\n",
+       "      <td>50.180785</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>ORN ENRJ SSTMS INKRPRTT</td>\n",
+       "      <td>WLSN FNRL HM INKRPRTT</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1426</th>\n",
+       "      <td>0.008914</td>\n",
+       "      <td>0.501545</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>3995</td>\n",
+       "      <td>1010</td>\n",
+       "      <td>schneider national, incorporated</td>\n",
+       "      <td>40.schneider resources, incorporated</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.000005</td>\n",
+       "      <td>0.000005</td>\n",
+       "      <td>12.534319</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>wisconsin</td>\n",
+       "      <td>wisconsin</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0.004100</td>\n",
+       "      <td>0.004100</td>\n",
+       "      <td>2.32178</td>\n",
+       "      <td>50.180785</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>SXNTR NXNL INKRPRTT</td>\n",
+       "      <td>SXNTR RSRSS INKRPRTT</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4672</th>\n",
+       "      <td>13.232266</td>\n",
+       "      <td>0.999896</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>6568</td>\n",
+       "      <td>4608</td>\n",
+       "      <td>wesbanco incorporated</td>\n",
+       "      <td>wesbanco, incorporated</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0.000005</td>\n",
+       "      <td>0.000005</td>\n",
+       "      <td>35295.437753</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>west virginia</td>\n",
+       "      <td>west virginia</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0.001207</td>\n",
+       "      <td>0.001207</td>\n",
+       "      <td>2.32178</td>\n",
+       "      <td>170.429672</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>WSBNK INKRPRTT</td>\n",
+       "      <td>WSBNK INKRPRTT</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1829</th>\n",
+       "      <td>13.257062</td>\n",
+       "      <td>0.999898</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>497</td>\n",
+       "      <td>4974</td>\n",
+       "      <td>berkshire hathaway energy company</td>\n",
+       "      <td>berkshire hathaway energy company</td>\n",
+       "      <td>4</td>\n",
+       "      <td>0.000010</td>\n",
+       "      <td>0.000010</td>\n",
+       "      <td>695779.273116</td>\n",
+       "      <td>0.053272</td>\n",
+       "      <td>iowa</td>\n",
+       "      <td>iowa</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0.001246</td>\n",
+       "      <td>0.001246</td>\n",
+       "      <td>2.32178</td>\n",
+       "      <td>165.103745</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>BRKXR H0W ENRJ KMPN</td>\n",
+       "      <td>BRKXR H0W ENRJ KMPN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6458</th>\n",
+       "      <td>13.550873</td>\n",
+       "      <td>0.999917</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>3842</td>\n",
+       "      <td>749</td>\n",
+       "      <td>shiftpixy, incorporated</td>\n",
+       "      <td>shiftpixy labs, incorporated</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0.000005</td>\n",
+       "      <td>0.000005</td>\n",
+       "      <td>35295.437753</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>wyoming</td>\n",
+       "      <td>wyoming</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0.000968</td>\n",
+       "      <td>0.000968</td>\n",
+       "      <td>2.32178</td>\n",
+       "      <td>212.547350</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>XFTPKS INKRPRTT</td>\n",
+       "      <td>XFTPKS LBS INKRPRTT</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1330</th>\n",
+       "      <td>13.621474</td>\n",
+       "      <td>0.999921</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>4088</td>\n",
+       "      <td>476</td>\n",
+       "      <td>securetech innovations, incorporated</td>\n",
+       "      <td>securetech innovations, incorporated</td>\n",
+       "      <td>4</td>\n",
+       "      <td>0.000010</td>\n",
+       "      <td>0.000010</td>\n",
+       "      <td>695779.273116</td>\n",
+       "      <td>0.053272</td>\n",
+       "      <td>wyoming</td>\n",
+       "      <td>wyoming</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0.000968</td>\n",
+       "      <td>0.000968</td>\n",
+       "      <td>2.32178</td>\n",
+       "      <td>212.547350</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>SKRTX INFXNS INKRPRTT</td>\n",
+       "      <td>SKRTX INFXNS INKRPRTT</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6186</th>\n",
+       "      <td>14.206436</td>\n",
+       "      <td>0.999947</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>8116</td>\n",
+       "      <td>2004</td>\n",
+       "      <td>southwestern public service company</td>\n",
+       "      <td>southwestern public service company</td>\n",
+       "      <td>4</td>\n",
+       "      <td>0.000010</td>\n",
+       "      <td>0.000010</td>\n",
+       "      <td>695779.273116</td>\n",
+       "      <td>0.053272</td>\n",
+       "      <td>new mexico</td>\n",
+       "      <td>new mexico</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0.000645</td>\n",
+       "      <td>0.000645</td>\n",
+       "      <td>2.32178</td>\n",
+       "      <td>318.821024</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>S0WSTRN PBLK SRFS KMPN</td>\n",
+       "      <td>S0WSTRN PBLK SRFS KMPN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>7540 rows × 24 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      match_weight  match_probability         source_dataset_l         source_dataset_r  record_id_l  record_id_r                        company_name_l                        company_name_r  gamma_company_name  tf_company_name_l  tf_company_name_r  bf_company_name  bf_tf_adj_company_name loc_of_incorporation_l loc_of_incorporation_r  gamma_loc_of_incorporation  tf_loc_of_incorporation_l  tf_loc_of_incorporation_r  bf_loc_of_incorporation  bf_tf_adj_loc_of_incorporation  report_year_l  report_year_r    company_name_mphone_l    company_name_mphone_r\n",
+       "5374      0.008914           0.501545  __splink__input_table_0  __splink__input_table_1         6916         7681             manitowoc co incorporated    manitowoc crane companies, llc mcg                   1           0.000005           0.000005        12.534319                1.000000              wisconsin              wisconsin                           3                   0.004100                   0.004100                  2.32178                       50.180785           2023           2023         MNTWK K INKRPRTT   MNTWK KRN KMPNS LK MKK\n",
+       "1452      0.008914           0.501545  __splink__input_table_0  __splink__input_table_1         3995         1003      schneider national, incorporated  33.schneider logistics, incorporated                   1           0.000005           0.000005        12.534319                1.000000              wisconsin              wisconsin                           3                   0.004100                   0.004100                  2.32178                       50.180785           2023           2023      SXNTR NXNL INKRPRTT    SXNTR LJSTKS INKRPRTT\n",
+       "4185      0.008914           0.501545  __splink__input_table_0  __splink__input_table_1          485         6819      wisconsin electric power company  wisconsin energy capital corporation                   1           0.000010           0.000005        12.534319                1.000000              wisconsin              wisconsin                           3                   0.004100                   0.004100                  2.32178                       50.180785           2023           2023   WSKNSN ELKTRK PWR KMPN  WSKNSN ENRJ KPTL KRPRXN\n",
+       "3907      0.008914           0.501545  __splink__input_table_0  __splink__input_table_1         1836         1390    orion energy systems, incorporated     wilson funeral home, incorporated                   1           0.000005           0.000005        12.534319                1.000000              wisconsin              wisconsin                           3                   0.004100                   0.004100                  2.32178                       50.180785           2023           2023  ORN ENRJ SSTMS INKRPRTT    WLSN FNRL HM INKRPRTT\n",
+       "1426      0.008914           0.501545  __splink__input_table_0  __splink__input_table_1         3995         1010      schneider national, incorporated  40.schneider resources, incorporated                   1           0.000005           0.000005        12.534319                1.000000              wisconsin              wisconsin                           3                   0.004100                   0.004100                  2.32178                       50.180785           2023           2023      SXNTR NXNL INKRPRTT     SXNTR RSRSS INKRPRTT\n",
+       "...            ...                ...                      ...                      ...          ...          ...                                   ...                                   ...                 ...                ...                ...              ...                     ...                    ...                    ...                         ...                        ...                        ...                      ...                             ...            ...            ...                      ...                      ...\n",
+       "4672     13.232266           0.999896  __splink__input_table_0  __splink__input_table_1         6568         4608                 wesbanco incorporated                wesbanco, incorporated                   3           0.000005           0.000005     35295.437753                1.000000          west virginia          west virginia                           3                   0.001207                   0.001207                  2.32178                      170.429672           2023           2023           WSBNK INKRPRTT           WSBNK INKRPRTT\n",
+       "1829     13.257062           0.999898  __splink__input_table_0  __splink__input_table_1          497         4974     berkshire hathaway energy company     berkshire hathaway energy company                   4           0.000010           0.000010    695779.273116                0.053272                   iowa                   iowa                           3                   0.001246                   0.001246                  2.32178                      165.103745           2023           2023      BRKXR H0W ENRJ KMPN      BRKXR H0W ENRJ KMPN\n",
+       "6458     13.550873           0.999917  __splink__input_table_0  __splink__input_table_1         3842          749               shiftpixy, incorporated          shiftpixy labs, incorporated                   3           0.000005           0.000005     35295.437753                1.000000                wyoming                wyoming                           3                   0.000968                   0.000968                  2.32178                      212.547350           2023           2023          XFTPKS INKRPRTT      XFTPKS LBS INKRPRTT\n",
+       "1330     13.621474           0.999921  __splink__input_table_0  __splink__input_table_1         4088          476  securetech innovations, incorporated  securetech innovations, incorporated                   4           0.000010           0.000010    695779.273116                0.053272                wyoming                wyoming                           3                   0.000968                   0.000968                  2.32178                      212.547350           2023           2023    SKRTX INFXNS INKRPRTT    SKRTX INFXNS INKRPRTT\n",
+       "6186     14.206436           0.999947  __splink__input_table_0  __splink__input_table_1         8116         2004   southwestern public service company   southwestern public service company                   4           0.000010           0.000010    695779.273116                0.053272             new mexico             new mexico                           3                   0.000645                   0.000645                  2.32178                      318.821024           2023           2023   S0WSTRN PBLK SRFS KMPN   S0WSTRN PBLK SRFS KMPN\n",
+       "\n",
+       "[7540 rows x 24 columns]"
+      ]
+     },
+     "execution_count": 123,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "preds_df.sort_values(by=\"match_probability\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 238,
+   "id": "255272b6-a5c4-4ab8-bebc-d13e77655938",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Index(['match_weight', 'match_probability', 'source_dataset_l',\n",
+       "       'source_dataset_r', 'record_id_l', 'record_id_r', 'company_name_l',\n",
+       "       'company_name_r', 'gamma_company_name', 'tf_company_name_l',\n",
+       "       'tf_company_name_r', 'bf_company_name', 'bf_tf_adj_company_name',\n",
+       "       'loc_of_incorporation_l', 'loc_of_incorporation_r',\n",
+       "       'gamma_loc_of_incorporation', 'tf_loc_of_incorporation_l',\n",
+       "       'tf_loc_of_incorporation_r', 'bf_loc_of_incorporation',\n",
+       "       'bf_tf_adj_loc_of_incorporation', 'company_name_mphone_l',\n",
+       "       'company_name_mphone_r', 'report_year_l', 'report_year_r'],\n",
+       "      dtype='object')"
+      ]
+     },
+     "execution_count": 238,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "preds_df.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 249,
+   "id": "8e658c36-7b6f-480f-9d74-37af9510ebe2",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>match_probability</th>\n",
+       "      <th>company_name_l</th>\n",
+       "      <th>company_name_r</th>\n",
+       "      <th>loc_of_incorporation_l</th>\n",
+       "      <th>loc_of_incorporation_r</th>\n",
+       "      <th>company_name_mphone_l</th>\n",
+       "      <th>company_name_mphone_r</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>150</th>\n",
+       "      <td>0.996128</td>\n",
+       "      <td>santander drive auto receivables trust 2018-1</td>\n",
+       "      <td>santander drive auto receivables trust</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>SNTNTR TRF AT RSFBLS TRST</td>\n",
+       "      <td>SNTNTR TRF AT RSFBLS TRST</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>151</th>\n",
+       "      <td>0.996128</td>\n",
+       "      <td>santander drive auto receivables trust 2018-5</td>\n",
+       "      <td>santander drive auto receivables trust</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>SNTNTR TRF AT RSFBLS TRST</td>\n",
+       "      <td>SNTNTR TRF AT RSFBLS TRST</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>152</th>\n",
+       "      <td>0.996128</td>\n",
+       "      <td>santander drive auto receivables trust 2018-3</td>\n",
+       "      <td>santander drive auto receivables trust</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>SNTNTR TRF AT RSFBLS TRST</td>\n",
+       "      <td>SNTNTR TRF AT RSFBLS TRST</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>153</th>\n",
+       "      <td>0.996128</td>\n",
+       "      <td>santander drive auto receivables trust 2016-1</td>\n",
+       "      <td>santander drive auto receivables trust</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>SNTNTR TRF AT RSFBLS TRST</td>\n",
+       "      <td>SNTNTR TRF AT RSFBLS TRST</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>154</th>\n",
+       "      <td>0.573277</td>\n",
+       "      <td>constellation pharmaceuticals inc</td>\n",
+       "      <td>constellation connect, llc</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>KNSTLXN FRMSTKLS INK</td>\n",
+       "      <td>KNSTLXN KNKT LK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>162</th>\n",
+       "      <td>0.959568</td>\n",
+       "      <td>consolidated communications holdings, inc.</td>\n",
+       "      <td>consolidated communications of</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>illinois</td>\n",
+       "      <td>KNSLTTT KMNKXNS HLTNKS INK</td>\n",
+       "      <td>KNSLTTT KMNKXNS OF</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>163</th>\n",
+       "      <td>0.959568</td>\n",
+       "      <td>consolidated communications holdings, inc.</td>\n",
+       "      <td>consolidated communications of</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>missouri</td>\n",
+       "      <td>KNSLTTT KMNKXNS HLTNKS INK</td>\n",
+       "      <td>KNSLTTT KMNKXNS OF</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>164</th>\n",
+       "      <td>0.959568</td>\n",
+       "      <td>consolidated communications holdings, inc.</td>\n",
+       "      <td>consolidated communications of</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>maine</td>\n",
+       "      <td>KNSLTTT KMNKXNS HLTNKS INK</td>\n",
+       "      <td>KNSLTTT KMNKXNS OF</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>165</th>\n",
+       "      <td>0.959568</td>\n",
+       "      <td>consolidated communications holdings, inc.</td>\n",
+       "      <td>consolidated communications of</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>kansas</td>\n",
+       "      <td>KNSLTTT KMNKXNS HLTNKS INK</td>\n",
+       "      <td>KNSLTTT KMNKXNS OF</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>166</th>\n",
+       "      <td>0.959568</td>\n",
+       "      <td>consolidated communications holdings, inc.</td>\n",
+       "      <td>consolidated communications of</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>minnesota</td>\n",
+       "      <td>KNSLTTT KMNKXNS HLTNKS INK</td>\n",
+       "      <td>KNSLTTT KMNKXNS OF</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>167</th>\n",
+       "      <td>0.959568</td>\n",
+       "      <td>consolidated communications holdings, inc.</td>\n",
+       "      <td>consolidated communications of</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>central</td>\n",
+       "      <td>KNSLTTT KMNKXNS HLTNKS INK</td>\n",
+       "      <td>KNSLTTT KMNKXNS OF</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>168</th>\n",
+       "      <td>0.959568</td>\n",
+       "      <td>consolidated communications holdings, inc.</td>\n",
+       "      <td>consolidated communications of</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>florida</td>\n",
+       "      <td>KNSLTTT KMNKXNS HLTNKS INK</td>\n",
+       "      <td>KNSLTTT KMNKXNS OF</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>169</th>\n",
+       "      <td>0.959568</td>\n",
+       "      <td>consolidated communications holdings, inc.</td>\n",
+       "      <td>consolidated communications of</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>california</td>\n",
+       "      <td>KNSLTTT KMNKXNS HLTNKS INK</td>\n",
+       "      <td>KNSLTTT KMNKXNS OF</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>174</th>\n",
+       "      <td>0.573277</td>\n",
+       "      <td>duke energy corp</td>\n",
+       "      <td>duke energy one, inc</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>TK ENRJ KRP</td>\n",
+       "      <td>TK ENRJ ON INK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>177</th>\n",
+       "      <td>0.573277</td>\n",
+       "      <td>verus international, inc.</td>\n",
+       "      <td>emcor international, inc</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>FRS INTRNXNL INK</td>\n",
+       "      <td>EMKR INTRNXNL INK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>178</th>\n",
+       "      <td>0.573277</td>\n",
+       "      <td>verus international, inc.</td>\n",
+       "      <td>emcor international, inc</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>FRS INTRNXNL INK</td>\n",
+       "      <td>EMKR INTRNXNL INK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>179</th>\n",
+       "      <td>0.714594</td>\n",
+       "      <td>green plains inc.</td>\n",
+       "      <td>green plains superior llc fka superior</td>\n",
+       "      <td>iowa</td>\n",
+       "      <td>iowa</td>\n",
+       "      <td>KRN PLNS INK</td>\n",
+       "      <td>KRN PLNS SPRR LK FK SPRR</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>183</th>\n",
+       "      <td>0.996128</td>\n",
+       "      <td>duke energy corp</td>\n",
+       "      <td>duke energy group, llc</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>TK ENRJ KRP</td>\n",
+       "      <td>TK ENRJ KRP LK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>195</th>\n",
+       "      <td>0.884993</td>\n",
+       "      <td>green stream holdings inc.</td>\n",
+       "      <td>western gas wyoming, l.l.c</td>\n",
+       "      <td>wyoming</td>\n",
+       "      <td>wyoming</td>\n",
+       "      <td>KRN STRM HLTNKS INK</td>\n",
+       "      <td>WSTRN KS YMNK LLK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>196</th>\n",
+       "      <td>0.884993</td>\n",
+       "      <td>green stream holdings inc.</td>\n",
+       "      <td>western gas wyoming, l.l.c</td>\n",
+       "      <td>wyoming</td>\n",
+       "      <td>wyoming</td>\n",
+       "      <td>KRN STRM HLTNKS INK</td>\n",
+       "      <td>WSTRN KS YMNK LLK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>197</th>\n",
+       "      <td>0.992184</td>\n",
+       "      <td>fortress biotech, inc.</td>\n",
+       "      <td>fortress biotech, china, inc</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>None</td>\n",
+       "      <td>FRTRS BTX INK</td>\n",
+       "      <td>FRTRS BTX XN INK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>199</th>\n",
+       "      <td>0.996128</td>\n",
+       "      <td>duke energy corp</td>\n",
+       "      <td>duke energy china corp</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>TK ENRJ KRP</td>\n",
+       "      <td>TK ENRJ XN KRP</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>200</th>\n",
+       "      <td>0.573277</td>\n",
+       "      <td>duke energy corp</td>\n",
+       "      <td>duke energy corporate services, inc</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>TK ENRJ KRP</td>\n",
+       "      <td>TK ENRJ KRPRT SRFSS INK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>203</th>\n",
+       "      <td>0.573277</td>\n",
+       "      <td>apollo global management, inc.</td>\n",
+       "      <td>apollo belenos management llc</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>APL KLBL MNJMNT INK</td>\n",
+       "      <td>APL BLNS MNJMNT LK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>204</th>\n",
+       "      <td>0.573277</td>\n",
+       "      <td>apollo global management, inc.</td>\n",
+       "      <td>apollo belenos management llc</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>APL KLBL MNJMNT INK</td>\n",
+       "      <td>APL BLNS MNJMNT LK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>206</th>\n",
+       "      <td>0.981099</td>\n",
+       "      <td>columbia property trust, inc.</td>\n",
+       "      <td>columbia courtyard, inc</td>\n",
+       "      <td>maryland</td>\n",
+       "      <td>maryland</td>\n",
+       "      <td>KLMB PRPRT TRST INK</td>\n",
+       "      <td>KLMB KRTYRT INK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>208</th>\n",
+       "      <td>0.573277</td>\n",
+       "      <td>duke energy corp</td>\n",
+       "      <td>duke energy beckjord, llc</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>TK ENRJ KRP</td>\n",
+       "      <td>TK ENRJ BKJRT LK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>209</th>\n",
+       "      <td>0.573277</td>\n",
+       "      <td>duke energy corp</td>\n",
+       "      <td>duke energy beckjord storage llc</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>TK ENRJ KRP</td>\n",
+       "      <td>TK ENRJ BKJRT STRJ LK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>210</th>\n",
+       "      <td>0.573277</td>\n",
+       "      <td>duke energy corp</td>\n",
+       "      <td>duke energy acp, llc</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>TK ENRJ KRP</td>\n",
+       "      <td>TK ENRJ AKP LK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>213</th>\n",
+       "      <td>0.981099</td>\n",
+       "      <td>spirit realty capital, inc.</td>\n",
+       "      <td>spirit reit, inc</td>\n",
+       "      <td>maryland</td>\n",
+       "      <td>maryland</td>\n",
+       "      <td>SPRT RLT KPTL INK</td>\n",
+       "      <td>SPRT RT INK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>215</th>\n",
+       "      <td>0.573277</td>\n",
+       "      <td>apollo global management, inc.</td>\n",
+       "      <td>apollo na management ii, llc</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>APL KLBL MNJMNT INK</td>\n",
+       "      <td>APL N MNJMNT LK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>216</th>\n",
+       "      <td>0.573277</td>\n",
+       "      <td>apollo global management, inc.</td>\n",
+       "      <td>apollo na management ii, llc</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>APL KLBL MNJMNT INK</td>\n",
+       "      <td>APL N MNJMNT LK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>225</th>\n",
+       "      <td>0.992184</td>\n",
+       "      <td>fortress biotech, inc.</td>\n",
+       "      <td>fortress biotech, china, inc</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>None</td>\n",
+       "      <td>FRTRS BTX INK</td>\n",
+       "      <td>FRTRS BTX XN INK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>226</th>\n",
+       "      <td>0.573277</td>\n",
+       "      <td>green brick partners, inc.</td>\n",
+       "      <td>green brick mortgage, llc</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>KRN BRK PRTNRS INK</td>\n",
+       "      <td>KRN BRK MRTKJ LK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>227</th>\n",
+       "      <td>0.573277</td>\n",
+       "      <td>duke energy corp</td>\n",
+       "      <td>duke energy beckjord storage llc</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>TK ENRJ KRP</td>\n",
+       "      <td>TK ENRJ BKJRT STRJ LK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>228</th>\n",
+       "      <td>0.959568</td>\n",
+       "      <td>green plains inc.</td>\n",
+       "      <td>green plains madison llc</td>\n",
+       "      <td>iowa</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>KRN PLNS INK</td>\n",
+       "      <td>KRN PLNS MTSN LK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>242</th>\n",
+       "      <td>0.959568</td>\n",
+       "      <td>great lakes dredge &amp; dock corp</td>\n",
+       "      <td>great lakes dredge &amp; dock do brasil ltda</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>brazil</td>\n",
+       "      <td>KRT LKS TRJ TK KRP</td>\n",
+       "      <td>KRT LKS TRJ TK T BRSL LTT</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>243</th>\n",
+       "      <td>0.573277</td>\n",
+       "      <td>great lakes dredge &amp; dock corp</td>\n",
+       "      <td>great lakes dredge &amp; dock environmental, inc</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>KRT LKS TRJ TK KRP</td>\n",
+       "      <td>KRT LKS TRJ TK ENFRNMNTL INK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>244</th>\n",
+       "      <td>0.996128</td>\n",
+       "      <td>great lakes dredge &amp; dock corp</td>\n",
+       "      <td>great lakes dredge &amp; dock company, llc</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>KRT LKS TRJ TK KRP</td>\n",
+       "      <td>KRT LKS TRJ TK KMPN LK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>251</th>\n",
+       "      <td>0.573277</td>\n",
+       "      <td>blackstone group inc</td>\n",
+       "      <td>blackstone pb ii l.l.c</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>BLKSTN KRP INK</td>\n",
+       "      <td>BLKSTN PB LLK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>252</th>\n",
+       "      <td>0.573277</td>\n",
+       "      <td>blackstone group inc</td>\n",
+       "      <td>blackstone pb i l.l.c</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>BLKSTN KRP INK</td>\n",
+       "      <td>BLKSTN PB I LLK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>254</th>\n",
+       "      <td>0.573277</td>\n",
+       "      <td>duke energy corp</td>\n",
+       "      <td>duke energy acp, llc</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>TK ENRJ KRP</td>\n",
+       "      <td>TK ENRJ AKP LK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>255</th>\n",
+       "      <td>0.573277</td>\n",
+       "      <td>duke energy corp</td>\n",
+       "      <td>duke energy shoreham, llc</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>TK ENRJ KRP</td>\n",
+       "      <td>TK ENRJ XRHM LK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>256</th>\n",
+       "      <td>0.573277</td>\n",
+       "      <td>duke energy corp</td>\n",
+       "      <td>duke energy sam, llc</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>TK ENRJ KRP</td>\n",
+       "      <td>TK ENRJ SM LK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>257</th>\n",
+       "      <td>0.573277</td>\n",
+       "      <td>blackstone group inc</td>\n",
+       "      <td>blackstone obs l.l.c</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>BLKSTN KRP INK</td>\n",
+       "      <td>BLKSTN OBS LLK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>264</th>\n",
+       "      <td>0.992184</td>\n",
+       "      <td>freightcar america, inc.</td>\n",
+       "      <td>freightcar america leasing, llc</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>None</td>\n",
+       "      <td>FRTKR AMRK INK</td>\n",
+       "      <td>FRTKR AMRK LSNK LK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>265</th>\n",
+       "      <td>0.992184</td>\n",
+       "      <td>freightcar america, inc.</td>\n",
+       "      <td>freightcar america leasing, llc</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>None</td>\n",
+       "      <td>FRTKR AMRK INK</td>\n",
+       "      <td>FRTKR AMRK LSNK LK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>266</th>\n",
+       "      <td>0.959568</td>\n",
+       "      <td>qurate retail, inc.</td>\n",
+       "      <td>qurate retail group, inc</td>\n",
+       "      <td>englewood</td>\n",
+       "      <td>de</td>\n",
+       "      <td>KRT RTL INK</td>\n",
+       "      <td>KRT RTL KRP INK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>267</th>\n",
+       "      <td>0.884993</td>\n",
+       "      <td>green stream holdings inc.</td>\n",
+       "      <td>western gas wyoming, l.l.c</td>\n",
+       "      <td>wyoming</td>\n",
+       "      <td>wyoming</td>\n",
+       "      <td>KRN STRM HLTNKS INK</td>\n",
+       "      <td>WSTRN KS YMNK LLK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>268</th>\n",
+       "      <td>0.884993</td>\n",
+       "      <td>green stream holdings inc.</td>\n",
+       "      <td>western gas wyoming, l.l.c</td>\n",
+       "      <td>wyoming</td>\n",
+       "      <td>wyoming</td>\n",
+       "      <td>KRN STRM HLTNKS INK</td>\n",
+       "      <td>WSTRN KS YMNK LLK</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     match_probability                                 company_name_l                                company_name_r loc_of_incorporation_l loc_of_incorporation_r       company_name_mphone_l         company_name_mphone_r\n",
+       "150           0.996128  santander drive auto receivables trust 2018-1        santander drive auto receivables trust               delaware               delaware  SNTNTR TRF AT RSFBLS TRST      SNTNTR TRF AT RSFBLS TRST\n",
+       "151           0.996128  santander drive auto receivables trust 2018-5        santander drive auto receivables trust               delaware               delaware  SNTNTR TRF AT RSFBLS TRST      SNTNTR TRF AT RSFBLS TRST\n",
+       "152           0.996128  santander drive auto receivables trust 2018-3        santander drive auto receivables trust               delaware               delaware  SNTNTR TRF AT RSFBLS TRST      SNTNTR TRF AT RSFBLS TRST\n",
+       "153           0.996128  santander drive auto receivables trust 2016-1        santander drive auto receivables trust               delaware               delaware  SNTNTR TRF AT RSFBLS TRST      SNTNTR TRF AT RSFBLS TRST\n",
+       "154           0.573277              constellation pharmaceuticals inc                    constellation connect, llc               delaware               delaware        KNSTLXN FRMSTKLS INK               KNSTLXN KNKT LK\n",
+       "162           0.959568     consolidated communications holdings, inc.                consolidated communications of               delaware               illinois  KNSLTTT KMNKXNS HLTNKS INK            KNSLTTT KMNKXNS OF\n",
+       "163           0.959568     consolidated communications holdings, inc.                consolidated communications of               delaware               missouri  KNSLTTT KMNKXNS HLTNKS INK            KNSLTTT KMNKXNS OF\n",
+       "164           0.959568     consolidated communications holdings, inc.                consolidated communications of               delaware                  maine  KNSLTTT KMNKXNS HLTNKS INK            KNSLTTT KMNKXNS OF\n",
+       "165           0.959568     consolidated communications holdings, inc.                consolidated communications of               delaware                 kansas  KNSLTTT KMNKXNS HLTNKS INK            KNSLTTT KMNKXNS OF\n",
+       "166           0.959568     consolidated communications holdings, inc.                consolidated communications of               delaware              minnesota  KNSLTTT KMNKXNS HLTNKS INK            KNSLTTT KMNKXNS OF\n",
+       "167           0.959568     consolidated communications holdings, inc.                consolidated communications of               delaware                central  KNSLTTT KMNKXNS HLTNKS INK            KNSLTTT KMNKXNS OF\n",
+       "168           0.959568     consolidated communications holdings, inc.                consolidated communications of               delaware                florida  KNSLTTT KMNKXNS HLTNKS INK            KNSLTTT KMNKXNS OF\n",
+       "169           0.959568     consolidated communications holdings, inc.                consolidated communications of               delaware             california  KNSLTTT KMNKXNS HLTNKS INK            KNSLTTT KMNKXNS OF\n",
+       "174           0.573277                               duke energy corp                          duke energy one, inc               delaware               delaware                 TK ENRJ KRP                TK ENRJ ON INK\n",
+       "177           0.573277                      verus international, inc.                      emcor international, inc               delaware               delaware            FRS INTRNXNL INK             EMKR INTRNXNL INK\n",
+       "178           0.573277                      verus international, inc.                      emcor international, inc               delaware               delaware            FRS INTRNXNL INK             EMKR INTRNXNL INK\n",
+       "179           0.714594                              green plains inc.        green plains superior llc fka superior                   iowa                   iowa                KRN PLNS INK      KRN PLNS SPRR LK FK SPRR\n",
+       "183           0.996128                               duke energy corp                        duke energy group, llc               delaware               delaware                 TK ENRJ KRP                TK ENRJ KRP LK\n",
+       "195           0.884993                     green stream holdings inc.                    western gas wyoming, l.l.c                wyoming                wyoming         KRN STRM HLTNKS INK             WSTRN KS YMNK LLK\n",
+       "196           0.884993                     green stream holdings inc.                    western gas wyoming, l.l.c                wyoming                wyoming         KRN STRM HLTNKS INK             WSTRN KS YMNK LLK\n",
+       "197           0.992184                         fortress biotech, inc.                  fortress biotech, china, inc               delaware                   None               FRTRS BTX INK              FRTRS BTX XN INK\n",
+       "199           0.996128                               duke energy corp                        duke energy china corp               delaware               delaware                 TK ENRJ KRP                TK ENRJ XN KRP\n",
+       "200           0.573277                               duke energy corp           duke energy corporate services, inc               delaware               delaware                 TK ENRJ KRP       TK ENRJ KRPRT SRFSS INK\n",
+       "203           0.573277                 apollo global management, inc.                 apollo belenos management llc               delaware               delaware         APL KLBL MNJMNT INK            APL BLNS MNJMNT LK\n",
+       "204           0.573277                 apollo global management, inc.                 apollo belenos management llc               delaware               delaware         APL KLBL MNJMNT INK            APL BLNS MNJMNT LK\n",
+       "206           0.981099                  columbia property trust, inc.                       columbia courtyard, inc               maryland               maryland         KLMB PRPRT TRST INK               KLMB KRTYRT INK\n",
+       "208           0.573277                               duke energy corp                     duke energy beckjord, llc               delaware               delaware                 TK ENRJ KRP              TK ENRJ BKJRT LK\n",
+       "209           0.573277                               duke energy corp              duke energy beckjord storage llc               delaware               delaware                 TK ENRJ KRP         TK ENRJ BKJRT STRJ LK\n",
+       "210           0.573277                               duke energy corp                          duke energy acp, llc               delaware               delaware                 TK ENRJ KRP                TK ENRJ AKP LK\n",
+       "213           0.981099                    spirit realty capital, inc.                              spirit reit, inc               maryland               maryland           SPRT RLT KPTL INK                   SPRT RT INK\n",
+       "215           0.573277                 apollo global management, inc.                  apollo na management ii, llc               delaware               delaware         APL KLBL MNJMNT INK               APL N MNJMNT LK\n",
+       "216           0.573277                 apollo global management, inc.                  apollo na management ii, llc               delaware               delaware         APL KLBL MNJMNT INK               APL N MNJMNT LK\n",
+       "225           0.992184                         fortress biotech, inc.                  fortress biotech, china, inc               delaware                   None               FRTRS BTX INK              FRTRS BTX XN INK\n",
+       "226           0.573277                     green brick partners, inc.                     green brick mortgage, llc               delaware               delaware          KRN BRK PRTNRS INK              KRN BRK MRTKJ LK\n",
+       "227           0.573277                               duke energy corp              duke energy beckjord storage llc               delaware               delaware                 TK ENRJ KRP         TK ENRJ BKJRT STRJ LK\n",
+       "228           0.959568                              green plains inc.                      green plains madison llc                   iowa               delaware                KRN PLNS INK              KRN PLNS MTSN LK\n",
+       "242           0.959568                 great lakes dredge & dock corp      great lakes dredge & dock do brasil ltda               delaware                 brazil          KRT LKS TRJ TK KRP     KRT LKS TRJ TK T BRSL LTT\n",
+       "243           0.573277                 great lakes dredge & dock corp  great lakes dredge & dock environmental, inc               delaware               delaware          KRT LKS TRJ TK KRP  KRT LKS TRJ TK ENFRNMNTL INK\n",
+       "244           0.996128                 great lakes dredge & dock corp        great lakes dredge & dock company, llc               delaware               delaware          KRT LKS TRJ TK KRP        KRT LKS TRJ TK KMPN LK\n",
+       "251           0.573277                           blackstone group inc                        blackstone pb ii l.l.c               delaware               delaware              BLKSTN KRP INK                 BLKSTN PB LLK\n",
+       "252           0.573277                           blackstone group inc                         blackstone pb i l.l.c               delaware               delaware              BLKSTN KRP INK               BLKSTN PB I LLK\n",
+       "254           0.573277                               duke energy corp                          duke energy acp, llc               delaware               delaware                 TK ENRJ KRP                TK ENRJ AKP LK\n",
+       "255           0.573277                               duke energy corp                     duke energy shoreham, llc               delaware               delaware                 TK ENRJ KRP               TK ENRJ XRHM LK\n",
+       "256           0.573277                               duke energy corp                          duke energy sam, llc               delaware               delaware                 TK ENRJ KRP                 TK ENRJ SM LK\n",
+       "257           0.573277                           blackstone group inc                          blackstone obs l.l.c               delaware               delaware              BLKSTN KRP INK                BLKSTN OBS LLK\n",
+       "264           0.992184                       freightcar america, inc.               freightcar america leasing, llc               delaware                   None              FRTKR AMRK INK            FRTKR AMRK LSNK LK\n",
+       "265           0.992184                       freightcar america, inc.               freightcar america leasing, llc               delaware                   None              FRTKR AMRK INK            FRTKR AMRK LSNK LK\n",
+       "266           0.959568                            qurate retail, inc.                      qurate retail group, inc              englewood                     de                 KRT RTL INK               KRT RTL KRP INK\n",
+       "267           0.884993                     green stream holdings inc.                    western gas wyoming, l.l.c                wyoming                wyoming         KRN STRM HLTNKS INK             WSTRN KS YMNK LLK\n",
+       "268           0.884993                     green stream holdings inc.                    western gas wyoming, l.l.c                wyoming                wyoming         KRN STRM HLTNKS INK             WSTRN KS YMNK LLK"
+      ]
+     },
+     "execution_count": 249,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "preds_df[preds_df.match_probability >= .5][[\"match_probability\", \"company_name_l\", \"company_name_r\", \"loc_of_incorporation_l\", \"loc_of_incorporation_r\", \"company_name_mphone_l\", \"company_name_mphone_r\"]].iloc[150:200]"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "mozilla_sec_eia",
+   "language": "python",
+   "name": "mozilla_sec_eia"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/17-kl-paragraph-layout-metrics.ipynb b/notebooks/17-kl-paragraph-layout-metrics.ipynb
new file mode 100644
index 0000000..f7c3a8d
--- /dev/null
+++ b/notebooks/17-kl-paragraph-layout-metrics.ipynb
@@ -0,0 +1,687 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "748b07d1-61ac-43b8-bff9-9f660626da1b",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 3"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "bb513a3e-31f7-49da-895b-e3ed4f52efd4",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "29c9b2e0-7f2f-4ab7-9972-f1ed30ff196a",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "archive = GCSArchive()\n",
+    "md = archive.get_metadata()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "1608bf1e-d6cf-4e3a-8f69-0e62744d0dfd",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>cik</th>\n",
+       "      <th>company_name</th>\n",
+       "      <th>form_type</th>\n",
+       "      <th>date_filed</th>\n",
+       "      <th>exhibit_21_version</th>\n",
+       "      <th>year_quarter</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>filename</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>edgar/data/17206/0000017206-94-000007.txt</th>\n",
+       "      <td>17206</td>\n",
+       "      <td>CAPITAL HOLDING CORP</td>\n",
+       "      <td>10-K/A</td>\n",
+       "      <td>1993-12-22</td>\n",
+       "      <td>None</td>\n",
+       "      <td>1993q4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>edgar/data/29082/0000950131-94-000021.txt</th>\n",
+       "      <td>29082</td>\n",
+       "      <td>DISNEY WALT CO</td>\n",
+       "      <td>10-K</td>\n",
+       "      <td>1993-12-22</td>\n",
+       "      <td>21</td>\n",
+       "      <td>1993q4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>edgar/data/32377/0000032377-94-000001.txt</th>\n",
+       "      <td>32377</td>\n",
+       "      <td>ELIZABETHTOWN GAS CO</td>\n",
+       "      <td>10-K</td>\n",
+       "      <td>1993-12-13</td>\n",
+       "      <td>21</td>\n",
+       "      <td>1993q4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>edgar/data/353944/0000353944-94-000005.txt</th>\n",
+       "      <td>353944</td>\n",
+       "      <td>INTERNATIONAL GAME TECHNOLOGY</td>\n",
+       "      <td>10-K</td>\n",
+       "      <td>1993-12-23</td>\n",
+       "      <td>21</td>\n",
+       "      <td>1993q4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>edgar/data/60512/0000060512-94-000006.txt</th>\n",
+       "      <td>60512</td>\n",
+       "      <td>LOUISIANA LAND &amp; EXPLORATION CO</td>\n",
+       "      <td>10-K/A</td>\n",
+       "      <td>1993-10-07</td>\n",
+       "      <td>None</td>\n",
+       "      <td>1993q4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>edgar/data/932021/0001493152-23-046428.txt</th>\n",
+       "      <td>932021</td>\n",
+       "      <td>GLOBAL TECHNOLOGIES LTD</td>\n",
+       "      <td>10-K</td>\n",
+       "      <td>2023-12-29</td>\n",
+       "      <td>21.1</td>\n",
+       "      <td>2023q4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>edgar/data/933974/0001558370-23-019262.txt</th>\n",
+       "      <td>933974</td>\n",
+       "      <td>Azenta, Inc.</td>\n",
+       "      <td>10-K</td>\n",
+       "      <td>2023-11-21</td>\n",
+       "      <td>21.0</td>\n",
+       "      <td>2023q4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>edgar/data/935419/0001628280-23-041580.txt</th>\n",
+       "      <td>935419</td>\n",
+       "      <td>RCI HOSPITALITY HOLDINGS, INC.</td>\n",
+       "      <td>10-K</td>\n",
+       "      <td>2023-12-14</td>\n",
+       "      <td>21.1</td>\n",
+       "      <td>2023q4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>edgar/data/936395/0000936395-23-000044.txt</th>\n",
+       "      <td>936395</td>\n",
+       "      <td>CIENA CORP</td>\n",
+       "      <td>10-K</td>\n",
+       "      <td>2023-12-15</td>\n",
+       "      <td>21.1</td>\n",
+       "      <td>2023q4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>edgar/data/936528/0000936528-23-000207.txt</th>\n",
+       "      <td>936528</td>\n",
+       "      <td>WAFD INC</td>\n",
+       "      <td>10-K</td>\n",
+       "      <td>2023-11-17</td>\n",
+       "      <td>None</td>\n",
+       "      <td>2023q4</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>290379 rows × 6 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                               cik  \\\n",
+       "filename                                             \n",
+       "edgar/data/17206/0000017206-94-000007.txt    17206   \n",
+       "edgar/data/29082/0000950131-94-000021.txt    29082   \n",
+       "edgar/data/32377/0000032377-94-000001.txt    32377   \n",
+       "edgar/data/353944/0000353944-94-000005.txt  353944   \n",
+       "edgar/data/60512/0000060512-94-000006.txt    60512   \n",
+       "...                                            ...   \n",
+       "edgar/data/932021/0001493152-23-046428.txt  932021   \n",
+       "edgar/data/933974/0001558370-23-019262.txt  933974   \n",
+       "edgar/data/935419/0001628280-23-041580.txt  935419   \n",
+       "edgar/data/936395/0000936395-23-000044.txt  936395   \n",
+       "edgar/data/936528/0000936528-23-000207.txt  936528   \n",
+       "\n",
+       "                                                               company_name  \\\n",
+       "filename                                                                      \n",
+       "edgar/data/17206/0000017206-94-000007.txt              CAPITAL HOLDING CORP   \n",
+       "edgar/data/29082/0000950131-94-000021.txt                    DISNEY WALT CO   \n",
+       "edgar/data/32377/0000032377-94-000001.txt              ELIZABETHTOWN GAS CO   \n",
+       "edgar/data/353944/0000353944-94-000005.txt    INTERNATIONAL GAME TECHNOLOGY   \n",
+       "edgar/data/60512/0000060512-94-000006.txt   LOUISIANA LAND & EXPLORATION CO   \n",
+       "...                                                                     ...   \n",
+       "edgar/data/932021/0001493152-23-046428.txt          GLOBAL TECHNOLOGIES LTD   \n",
+       "edgar/data/933974/0001558370-23-019262.txt                     Azenta, Inc.   \n",
+       "edgar/data/935419/0001628280-23-041580.txt   RCI HOSPITALITY HOLDINGS, INC.   \n",
+       "edgar/data/936395/0000936395-23-000044.txt                       CIENA CORP   \n",
+       "edgar/data/936528/0000936528-23-000207.txt                         WAFD INC   \n",
+       "\n",
+       "                                           form_type  date_filed  \\\n",
+       "filename                                                           \n",
+       "edgar/data/17206/0000017206-94-000007.txt     10-K/A  1993-12-22   \n",
+       "edgar/data/29082/0000950131-94-000021.txt       10-K  1993-12-22   \n",
+       "edgar/data/32377/0000032377-94-000001.txt       10-K  1993-12-13   \n",
+       "edgar/data/353944/0000353944-94-000005.txt      10-K  1993-12-23   \n",
+       "edgar/data/60512/0000060512-94-000006.txt     10-K/A  1993-10-07   \n",
+       "...                                              ...         ...   \n",
+       "edgar/data/932021/0001493152-23-046428.txt      10-K  2023-12-29   \n",
+       "edgar/data/933974/0001558370-23-019262.txt      10-K  2023-11-21   \n",
+       "edgar/data/935419/0001628280-23-041580.txt      10-K  2023-12-14   \n",
+       "edgar/data/936395/0000936395-23-000044.txt      10-K  2023-12-15   \n",
+       "edgar/data/936528/0000936528-23-000207.txt      10-K  2023-11-17   \n",
+       "\n",
+       "                                           exhibit_21_version year_quarter  \n",
+       "filename                                                                    \n",
+       "edgar/data/17206/0000017206-94-000007.txt                None       1993q4  \n",
+       "edgar/data/29082/0000950131-94-000021.txt                  21       1993q4  \n",
+       "edgar/data/32377/0000032377-94-000001.txt                  21       1993q4  \n",
+       "edgar/data/353944/0000353944-94-000005.txt                 21       1993q4  \n",
+       "edgar/data/60512/0000060512-94-000006.txt                None       1993q4  \n",
+       "...                                                       ...          ...  \n",
+       "edgar/data/932021/0001493152-23-046428.txt               21.1       2023q4  \n",
+       "edgar/data/933974/0001558370-23-019262.txt               21.0       2023q4  \n",
+       "edgar/data/935419/0001628280-23-041580.txt               21.1       2023q4  \n",
+       "edgar/data/936395/0000936395-23-000044.txt               21.1       2023q4  \n",
+       "edgar/data/936528/0000936528-23-000207.txt               None       2023q4  \n",
+       "\n",
+       "[290379 rows x 6 columns]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "md"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "bb94754e-3765-43f2-a5e1-8b55a4021da4",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "df = pd.DataFrame()\n",
+    "dir_name = Path(\"paragraph_layout_md\")\n",
+    "for filename in os.listdir(dir_name):\n",
+    "    if filename.split(\".\")[-1] != \"parquet\":\n",
+    "        continue\n",
+    "    yq_df = pd.read_parquet(dir_name / filename)\n",
+    "    df = pd.concat([df, yq_df])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "52828dfa-a951-4bc5-88a1-f8c2dca2628b",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>paragraph</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1011174-0001193125-10-030674</th>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1010612-0000950123-10-019499</th>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1003410-0001193125-10-046549</th>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1011308-0000921895-10-000357</th>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1009672-0000950123-10-018301</th>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>898293-0000950144-04-010550</th>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>894490-0001193125-04-212822</th>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>930803-0000950136-04-004585</th>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>893430-0001193125-04-212647</th>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>920354-0000950135-04-005647</th>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>98712 rows × 1 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                              paragraph\n",
+       "1011174-0001193125-10-030674      False\n",
+       "1010612-0000950123-10-019499      False\n",
+       "1003410-0001193125-10-046549       True\n",
+       "1011308-0000921895-10-000357       True\n",
+       "1009672-0000950123-10-018301       True\n",
+       "...                                 ...\n",
+       "898293-0000950144-04-010550       False\n",
+       "894490-0001193125-04-212822       False\n",
+       "930803-0000950136-04-004585       False\n",
+       "893430-0001193125-04-212647       False\n",
+       "920354-0000950135-04-005647        True\n",
+       "\n",
+       "[98712 rows x 1 columns]"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "94b2ecbc-1e08-4b3a-835f-a10327f88298",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "df.loc[:, \"full_filename\"] = \"edgar/data/\" + df.index.str.replace('-', '/', n=1) + \".txt\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "b9c56e81-3e98-44bf-8c70-256ce1d58d80",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "md[\"date_filed\"] = md[\"date_filed\"].astype(\"datetime64[ns]\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "d60efebc-72ff-41e8-b765-8edcadbe185e",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>paragraph</th>\n",
+       "      <th>full_filename</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1011174-0001193125-10-030674</th>\n",
+       "      <td>False</td>\n",
+       "      <td>edgar/data/1011174/0001193125-10-030674.txt</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1010612-0000950123-10-019499</th>\n",
+       "      <td>False</td>\n",
+       "      <td>edgar/data/1010612/0000950123-10-019499.txt</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                              paragraph  \\\n",
+       "1011174-0001193125-10-030674      False   \n",
+       "1010612-0000950123-10-019499      False   \n",
+       "\n",
+       "                                                            full_filename  \n",
+       "1011174-0001193125-10-030674  edgar/data/1011174/0001193125-10-030674.txt  \n",
+       "1010612-0000950123-10-019499  edgar/data/1010612/0000950123-10-019499.txt  "
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head(2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "0f6d512f-b07a-4204-b3cf-69e08848ef2d",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.27785882162249775"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# what fraction of files are paragraph layout?\n",
+    "md_merged = md.reset_index().merge(df, left_on=\"filename\", right_on=\"full_filename\", how=\"left\", validate=\"1:1\")\n",
+    "md_merged = md_merged.dropna(subset=\"paragraph\")\n",
+    "len(md_merged[md_merged.paragraph])/len(md_merged)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "67e63df0-ca52-4eef-b6aa-a1715f1ab081",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>filename</th>\n",
+       "      <th>cik</th>\n",
+       "      <th>company_name</th>\n",
+       "      <th>form_type</th>\n",
+       "      <th>date_filed</th>\n",
+       "      <th>exhibit_21_version</th>\n",
+       "      <th>year_quarter</th>\n",
+       "      <th>paragraph</th>\n",
+       "      <th>full_filename</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>edgar/data/100240/0000950144-94-000787.txt</td>\n",
+       "      <td>100240</td>\n",
+       "      <td>TURNER BROADCASTING SYSTEM INC</td>\n",
+       "      <td>10-K</td>\n",
+       "      <td>1994-03-31</td>\n",
+       "      <td>21</td>\n",
+       "      <td>1994q1</td>\n",
+       "      <td>False</td>\n",
+       "      <td>edgar/data/100240/0000950144-94-000787.txt</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>edgar/data/100885/0000100885-94-000006.txt</td>\n",
+       "      <td>100885</td>\n",
+       "      <td>UNION PACIFIC CORP</td>\n",
+       "      <td>10-K</td>\n",
+       "      <td>1994-03-29</td>\n",
+       "      <td>21</td>\n",
+       "      <td>1994q1</td>\n",
+       "      <td>False</td>\n",
+       "      <td>edgar/data/100885/0000100885-94-000006.txt</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                      filename     cik  \\\n",
+       "6   edgar/data/100240/0000950144-94-000787.txt  100240   \n",
+       "11  edgar/data/100885/0000100885-94-000006.txt  100885   \n",
+       "\n",
+       "                      company_name form_type date_filed exhibit_21_version  \\\n",
+       "6   TURNER BROADCASTING SYSTEM INC      10-K 1994-03-31                 21   \n",
+       "11              UNION PACIFIC CORP      10-K 1994-03-29                 21   \n",
+       "\n",
+       "   year_quarter paragraph                               full_filename  \n",
+       "6        1994q1     False  edgar/data/100240/0000950144-94-000787.txt  \n",
+       "11       1994q1     False  edgar/data/100885/0000100885-94-000006.txt  "
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "md_merged.head(2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "1e11faef-853b-48f2-9eb0-af7f8715cd41",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.10292571287189956"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# what fraction of CIKs are only covered by paragraph layout docs?\n",
+    "# get the set of unique CIKs in md_merged\n",
+    "all_ciks = set(md_merged.cik)\n",
+    "# get the set of CIKs that have at least one non-paragraph layout doc\n",
+    "no_paragraph_ciks = set(md_merged[md_merged[\"paragraph\"] == False].cik)\n",
+    "# get the set of CIKs that are in the full set but not the paragraph removed set\n",
+    "only_paragraph_ciks = all_ciks - no_paragraph_ciks\n",
+    "# divide that number by the total number of CIKs\n",
+    "len(only_paragraph_ciks)/len(all_ciks)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "6062d722-b1c7-4589-975e-7fe8cef65a40",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1664"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(only_paragraph_ciks)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3b1f6ab8-e3be-48c2-9ecb-346425af3777",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# what percentage of CIK and year-quarter coverage do we get if we exclude all paragraph filings"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "mozilla_sec_eia",
+   "language": "python",
+   "name": "mozilla_sec_eia"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/18-kl-splink-sec-eia.ipynb b/notebooks/18-kl-splink-sec-eia.ipynb
new file mode 100644
index 0000000..111ae84
--- /dev/null
+++ b/notebooks/18-kl-splink-sec-eia.ipynb
@@ -0,0 +1,3326 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "9029518c-ea19-4055-a938-36a5ea1804d8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 3"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 262,
+   "id": "1107fe42-197c-4fea-9c48-06d08699af0b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import os\n",
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "from splink import block_on, DuckDBAPI, Linker, SettingsCreator\n",
+    "from splink.blocking_analysis import count_comparisons_from_blocking_rule, cumulative_comparisons_to_be_scored_from_blocking_rules_chart, n_largest_blocks\n",
+    "import splink.comparison_library as cl\n",
+    "from splink.exploratory import completeness_chart, profile_columns\n",
+    "from upath import UPath\n",
+    "\n",
+    "from mozilla_sec_eia.models.sec_eia_record_linkage.preprocessing import prepare_sec10k_basic_info_df, prepare_ex21_df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9b8224d4-7596-45b7-bfb5-028f29a96f3d",
+   "metadata": {},
+   "source": [
+    "# Inputs\n",
+    "\n",
+    "Questions:\n",
+    "* What's the best way to dagsterize this to get EIA data from PUDL?"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fb6b3f3f-8c30-4810-90dd-75cfbeecc4e0",
+   "metadata": {},
+   "source": [
+    "### EIA"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "4ab5594d-7d1f-425d-80e1-92c30be73011",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "raw_eia_df = pd.read_parquet(\"s3://pudl.catalyst.coop/stable/out_eia__yearly_utilities.parquet\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "2edc29d4-6c85-4b31-aae6-0de38c846e44",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "mergers_df = pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_mergers.parquet\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "eaa37762-9f94-4927-9341-0ab09be3c8ab",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "raw_eia861_df = pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__assn_utility.parquet\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 104,
+   "id": "3fb7895f-10c5-4450-96f9-77b36471b53e",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "eia_df = raw_eia_df.copy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "06c76b82-1aad-47b2-aecc-6225a286cc40",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "harvested_df = pd.concat([\n",
+    "    pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_utility_data_misc.parquet\")[[\"report_date\", \"utility_id_eia\", \"utility_name_eia\"]],\n",
+    "    pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_operational_data_misc.parquet\")[[\"report_date\", \"utility_id_eia\", \"utility_name_eia\"]],\n",
+    "    pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_demand_side_management_misc.parquet\")[[\"report_date\", \"utility_id_eia\", \"utility_name_eia\"]],\n",
+    "    pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_energy_efficiency.parquet\")[[\"report_date\", \"utility_id_eia\", \"utility_name_eia\"]],\n",
+    "])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "d95acde9-1640-4c26-a5d1-c50b6666ccf4",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "eia861_df = raw_eia861_df.merge(harvested_df, on=[\"report_date\", \"utility_id_eia\"], how=\"left\").drop_duplicates(subset=[\"report_date\", \"utility_id_eia\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "3b7484de-bbc7-47ba-b408-a1af1183018c",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "mergers_df = mergers_df[mergers_df[\"new_parent\"].notna()]\n",
+    "eia861_df = eia861_df.merge(mergers_df[[\"report_date\", \"new_parent\", \"merge_address\", \"merge_city\", \"merge_state\"]], \n",
+    "                how=\"left\", \n",
+    "                left_on=[\"report_date\", \"utility_name_eia\"],\n",
+    "                right_on=[\"report_date\", \"new_parent\"]\n",
+    "               )\n",
+    "eia861_df = eia861_df.rename(columns={\"merge_address\": \"street_address\", \"merge_city\": \"city\"})\n",
+    "eia861_df = eia861_df.groupby([\"report_date\", \"utility_id_eia\"]).first().reset_index()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "d3d39fc0-130f-4bbd-9cc9-bbaf58808109",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "eia861_df[\"state\"] = eia861_df[\"state\"].where(eia861_df[\"merge_state\"].isnull(), eia861_df[\"merge_state\"])\n",
+    "eia861_df = eia861_df.drop(columns=[\"new_parent\", \"merge_state\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 105,
+   "id": "04b6b682-91f4-49e2-9f74-2861548d1dd4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "eia_df = pd.concat([eia_df, eia861_df])\n",
+    "eia_df = eia_df.drop_duplicates(subset=[\"utility_id_eia\", \"report_date\"], keep=\"first\")\n",
+    "# not sure at what point this stops being a datetime\n",
+    "eia_df[\"report_date\"] = eia_df[\"report_date\"].astype(\"datetime64[ns]\")\n",
+    "# there are nulls from non-harvested 861 utilities\n",
+    "eia_df = eia_df.dropna(subset=\"utility_name_eia\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ae7706d6-3f32-4c99-aeb6-dac6c6529eec",
+   "metadata": {},
+   "source": [
+    "### SEC 10K Basic Info"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "d4e950a6-ee6c-414c-b5b9-52a4175bf0b7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sec_path = UPath(\"gs://sec10k-outputs/v2/basic_10k_company_info\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "14eb7f24-7f7b-43aa-a0df-85e888e43821",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "raw_sec_df = pd.DataFrame()\n",
+    "for file in sec_path.iterdir():\n",
+    "    if file.name.split(\".\")[-1] == \"parquet\":\n",
+    "        raw_sec_df = pd.concat([raw_sec_df, pd.read_parquet(sec_path / file.name)])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "23da5ca1-bd04-44d4-b252-7b114d6d553f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th>value</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>filename</th>\n",
+       "      <th>filer_count</th>\n",
+       "      <th>block</th>\n",
+       "      <th>block_count</th>\n",
+       "      <th>key</th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th rowspan=\"5\" valign=\"top\">edgar/data/100240/0000950144-94-000787.txt</th>\n",
+       "      <th rowspan=\"5\" valign=\"top\">0</th>\n",
+       "      <th rowspan=\"5\" valign=\"top\">company_data</th>\n",
+       "      <th rowspan=\"5\" valign=\"top\">0</th>\n",
+       "      <th>company_conformed_name</th>\n",
+       "      <td>turner broadcasting system inc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>central_index_key</th>\n",
+       "      <td>0000100240</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>standard_industrial_classification</th>\n",
+       "      <td>4833</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>irs_number</th>\n",
+       "      <td>580950695</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>state_of_incorporation</th>\n",
+       "      <td>ga</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <th>...</th>\n",
+       "      <th>...</th>\n",
+       "      <th>...</th>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th rowspan=\"5\" valign=\"top\">edgar/data/936528/0000936528-23-000207.txt</th>\n",
+       "      <th rowspan=\"5\" valign=\"top\">0</th>\n",
+       "      <th rowspan=\"5\" valign=\"top\">former_company</th>\n",
+       "      <th>0</th>\n",
+       "      <th>date_of_name_change</th>\n",
+       "      <td>20230928</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th rowspan=\"2\" valign=\"top\">1</th>\n",
+       "      <th>former_conformed_name</th>\n",
+       "      <td>wafd inc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>date_of_name_change</th>\n",
+       "      <td>20230927</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th rowspan=\"2\" valign=\"top\">2</th>\n",
+       "      <th>former_conformed_name</th>\n",
+       "      <td>washington federal inc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>date_of_name_change</th>\n",
+       "      <td>19950206</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>7980908 rows × 1 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                                                                                                               value\n",
+       "filename                                   filer_count block          block_count key                                                               \n",
+       "edgar/data/100240/0000950144-94-000787.txt 0           company_data   0           company_conformed_name              turner broadcasting system inc\n",
+       "                                                                                  central_index_key                                       0000100240\n",
+       "                                                                                  standard_industrial_classification                            4833\n",
+       "                                                                                  irs_number                                               580950695\n",
+       "                                                                                  state_of_incorporation                                          ga\n",
+       "...                                                                                                                                              ...\n",
+       "edgar/data/936528/0000936528-23-000207.txt 0           former_company 0           date_of_name_change                                       20230928\n",
+       "                                                                      1           former_conformed_name                                     wafd inc\n",
+       "                                                                                  date_of_name_change                                       20230927\n",
+       "                                                                      2           former_conformed_name                       washington federal inc\n",
+       "                                                                                  date_of_name_change                                       19950206\n",
+       "\n",
+       "[7980908 rows x 1 columns]"
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "raw_sec_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 87,
+   "id": "1be3364e-9887-42b2-b303-0a24e8681acf",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "raw_sec_df = raw_sec_df.reset_index().pivot_table(values=\"value\", index=\"filename\", columns=\"key\", aggfunc=\"first\")\n",
+    "raw_sec_df.columns.name = None"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3bac9280-1183-4aba-b78f-84bcf37ef1e2",
+   "metadata": {},
+   "source": [
+    "### Ex. 21"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "id": "611da616-45ef-40ae-bc06-8bfbc871274d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ex21_path = UPath(\"gs://sec10k-outputs/v2/ex21_company_ownership_info\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 91,
+   "id": "1d6272f2-b6f3-4497-9251-cbeedf794a0b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "raw_ex21_df = pd.DataFrame()\n",
+    "for file in ex21_path.iterdir():\n",
+    "    if file.name.split(\".\")[-1] == \"parquet\":\n",
+    "        year_quarter_df = pd.read_parquet(ex21_path / file.name)\n",
+    "        report_year = file.name[:4]\n",
+    "        year_quarter_df.loc[:, \"report_year\"] = report_year\n",
+    "        year_quarter_df.loc[:, \"report_year\"] = pd.to_datetime(year_quarter_df[\"report_year\"], format=\"%Y\").dt.year\n",
+    "        raw_ex21_df = pd.concat([raw_ex21_df, year_quarter_df])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b636d438-ed71-426c-8c2a-9e550fe99958",
+   "metadata": {},
+   "source": [
+    "# Preprocessing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 153,
+   "id": "f6f76c8b-ffbf-4e2b-870b-57f1260ba522",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# cleaning on both sides\n",
+    "sec_clean_df = prepare_sec10k_basic_info_df(raw_sec_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 157,
+   "id": "84e26751-663b-45a5-bb4d-fbfbbdca447e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/katielamb/CatalystCoop/mozilla-sec-eia/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py:189: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
+      "  df = df.fillna(np.nan)\n"
+     ]
+    }
+   ],
+   "source": [
+    "ex21_clean_df = prepare_ex21_df(raw_ex21_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 224,
+   "id": "24defbd5-ccfe-4844-ab87-3adb1b4df2d9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "eia_clean_df = prepare_eia_df(eia_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 228,
+   "id": "a284b2c9-8edf-4b3f-ab08-5b2cff65ed19",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "SHARED_COLS = [\n",
+    "    \"record_id\",\n",
+    "    \"report_date\",\n",
+    "    \"report_year\",\n",
+    "    \"company_name\",\n",
+    "    \"street_address\",\n",
+    "    \"street_address_2\",\n",
+    "    \"city\",\n",
+    "    \"state\",  # could use state of incorporation from SEC\n",
+    "    \"zip_code\",\n",
+    "    \"phone_number\",\n",
+    "    \"company_name_mphone\"\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e90de0d3-3220-4869-80a3-fc7dd381d393",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# strip legal terms and then make a list column from company name\n",
+    "# use this for blocking and comparison levels\n",
+    "eia_match_df[\"company_name_mphone_list\"] = eia_match_df[\"company_name_mphone\"].str.split()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "460c5bd5-f2e2-45c3-86c3-ac203bd053d0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create list column for address information as well"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 158,
+   "id": "c3bdc160-1939-4f34-914f-ecb0b5fdb5ac",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>report_date</th>\n",
+       "      <th>report_year</th>\n",
+       "      <th>company_name</th>\n",
+       "      <th>street_address</th>\n",
+       "      <th>street_address_2</th>\n",
+       "      <th>city</th>\n",
+       "      <th>state</th>\n",
+       "      <th>zip_code</th>\n",
+       "      <th>phone_number</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>2000-03-30</td>\n",
+       "      <td>2000</td>\n",
+       "      <td>meta group incorporated</td>\n",
+       "      <td>208 harbor dr</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>stamford</td>\n",
+       "      <td>ct</td>\n",
+       "      <td>06912-0061</td>\n",
+       "      <td>2039736700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2001-04-02</td>\n",
+       "      <td>2001</td>\n",
+       "      <td>meta group incorporated</td>\n",
+       "      <td>208 harbor dr</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>stamford</td>\n",
+       "      <td>ct</td>\n",
+       "      <td>06912-0061</td>\n",
+       "      <td>2039736700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2002-04-01</td>\n",
+       "      <td>2002</td>\n",
+       "      <td>meta group incorporated</td>\n",
+       "      <td>208 harbor dr</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>stamford</td>\n",
+       "      <td>ct</td>\n",
+       "      <td>06912-0061</td>\n",
+       "      <td>2039736700</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "  report_date  report_year             company_name street_address street_address_2      city state    zip_code phone_number\n",
+       "0  2000-03-30         2000  meta group incorporated  208 harbor dr              NaN  stamford    ct  06912-0061   2039736700\n",
+       "1  2001-04-02         2001  meta group incorporated  208 harbor dr              NaN  stamford    ct  06912-0061   2039736700\n",
+       "2  2002-04-01         2002  meta group incorporated  208 harbor dr              NaN  stamford    ct  06912-0061   2039736700"
+      ]
+     },
+     "execution_count": 158,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sec_clean_df[SHARED_COLS].head(3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 159,
+   "id": "9d73fdac-8d97-4030-9772-79ac058b0d33",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>report_date</th>\n",
+       "      <th>report_year</th>\n",
+       "      <th>company_name</th>\n",
+       "      <th>street_address</th>\n",
+       "      <th>street_address_2</th>\n",
+       "      <th>city</th>\n",
+       "      <th>state</th>\n",
+       "      <th>zip_code</th>\n",
+       "      <th>phone_number</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>33</th>\n",
+       "      <td>2023-01-01</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>desert willow energy storage</td>\n",
+       "      <td>100 bayview circle</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>newport beach</td>\n",
+       "      <td>ca</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>35</th>\n",
+       "      <td>2023-01-01</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>portage solar plant</td>\n",
+       "      <td>n8917</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>portage</td>\n",
+       "      <td>wi</td>\n",
+       "      <td>53901</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>37</th>\n",
+       "      <td>2023-01-01</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>nsf energy one limited liability company</td>\n",
+       "      <td>1241 university ave</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>rochester</td>\n",
+       "      <td>ny</td>\n",
+       "      <td>14607</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   report_date  report_year                              company_name       street_address street_address_2           city state zip_code phone_number\n",
+       "33  2023-01-01         2023              desert willow energy storage   100 bayview circle              NaN  newport beach    ca      NaN          NaN\n",
+       "35  2023-01-01         2023                       portage solar plant                n8917              NaN        portage    wi    53901          NaN\n",
+       "37  2023-01-01         2023  nsf energy one limited liability company  1241 university ave              NaN      rochester    ny    14607          NaN"
+      ]
+     },
+     "execution_count": 159,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "eia_clean_df[~eia_match_df.street_address.isnull()][SHARED_COLS].head(3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 160,
+   "id": "db2b1e13-824e-4c86-8065-fc99e9a1186c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>record_id</th>\n",
+       "      <th>id</th>\n",
+       "      <th>company_name_raw</th>\n",
+       "      <th>loc_of_incorporation</th>\n",
+       "      <th>own_per</th>\n",
+       "      <th>report_year</th>\n",
+       "      <th>company_name</th>\n",
+       "      <th>company_name_mphone</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>14060-0000916131-94-000015</td>\n",
+       "      <td>brenton bank and trust company</td>\n",
+       "      <td>iowa</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1994</td>\n",
+       "      <td>brenton bank and trust company</td>\n",
+       "      <td>BRNTN BNK ANT TRST KMPN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>14060-0000916131-94-000015</td>\n",
+       "      <td>adel</td>\n",
+       "      <td>iowa</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1994</td>\n",
+       "      <td>adel</td>\n",
+       "      <td>ATL</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2</td>\n",
+       "      <td>14060-0000916131-94-000015</td>\n",
+       "      <td>brenton savings bank, fsb united states</td>\n",
+       "      <td>ames, iowa</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1994</td>\n",
+       "      <td>brenton savings bank, fsb united states</td>\n",
+       "      <td>BRNTN SFNKS BNK FSB UNTT STTS</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   record_id                          id                         company_name_raw loc_of_incorporation own_per  report_year                             company_name            company_name_mphone\n",
+       "0          0  14060-0000916131-94-000015           brenton bank and trust company                 iowa     NaN         1994           brenton bank and trust company        BRNTN BNK ANT TRST KMPN\n",
+       "1          1  14060-0000916131-94-000015                                     adel                 iowa     NaN         1994                                     adel                            ATL\n",
+       "2          2  14060-0000916131-94-000015  brenton savings bank, fsb united states           ames, iowa     NaN         1994  brenton savings bank, fsb united states  BRNTN SFNKS BNK FSB UNTT STTS"
+      ]
+     },
+     "execution_count": 160,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "ex21_clean_df.head(3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 229,
+   "id": "4ea7c80a-5b5b-4a07-bca0-b6ed1e78dce9",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['record_id',\n",
+       " 'report_date',\n",
+       " 'report_year',\n",
+       " 'company_name',\n",
+       " 'street_address',\n",
+       " 'street_address_2',\n",
+       " 'city',\n",
+       " 'state',\n",
+       " 'zip_code',\n",
+       " 'phone_number',\n",
+       " 'company_name_mphone']"
+      ]
+     },
+     "execution_count": 229,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "SHARED_COLS"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 231,
+   "id": "c734137b-fd76-4f6a-a828-23cd12d4ac27",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "eia_match_df = eia_clean_df[SHARED_COLS]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 232,
+   "id": "2b8b6313-abf0-4233-8bad-43b8b9cc1e0b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sec_match_df = sec_clean_df[SHARED_COLS]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9a04c196-e926-4502-82ee-c27352352591",
+   "metadata": {
+    "jp-MarkdownHeadingCollapsed": true,
+    "tags": []
+   },
+   "source": [
+    "# Link in Ex. 21 records"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 165,
+   "id": "c1500344-ff7f-450e-90dd-1105d8e7c637",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# run the Ex.21 to SEC model\n",
+    "filepath = Path(\"../sec_ex21_model_settings/2023_model.json\")\n",
+    "with open(filepath, 'r') as file:\n",
+    "    sec_ex21_settings = json.load(file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 192,
+   "id": "172ea84f-a0b7-4e9c-b746-322a47663171",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sec_test_df = sec_match_df[sec_match_df.report_year.isin([2016, 2017])][[\"record_id\", \"report_year\", \"company_name\", \"loc_of_incorporation\", \"company_name_mphone\"]]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 193,
+   "id": "3f8ba4ee-b1e7-4e05-982e-43d8e446eea9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ex21_test_df = ex21_match_df[ex21_match_df.report_year.isin([2016, 2017])][[\"record_id\", \"report_year\", \"company_name\", \"loc_of_incorporation\", \"company_name_mphone\"]]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 194,
+   "id": "2c715d7a-3d6d-4970-8ae3-5a6e1a12e937",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "14125"
+      ]
+     },
+     "execution_count": 194,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(sec_test_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 195,
+   "id": "ec13db12-3664-4e00-aa83-7c372039b230",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "233101"
+      ]
+     },
+     "execution_count": 195,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(ex21_test_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 196,
+   "id": "d2fcc1da-4435-4b17-8be7-cb34a6917522",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>record_id</th>\n",
+       "      <th>report_year</th>\n",
+       "      <th>company_name</th>\n",
+       "      <th>loc_of_incorporation</th>\n",
+       "      <th>company_name_mphone</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>23</th>\n",
+       "      <td>23</td>\n",
+       "      <td>2016</td>\n",
+       "      <td>nicholas financial incorporated</td>\n",
+       "      <td>florida</td>\n",
+       "      <td>NXLS FNNXL INKRPRTT</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>24</th>\n",
+       "      <td>24</td>\n",
+       "      <td>2017</td>\n",
+       "      <td>nicholas financial incorporated</td>\n",
+       "      <td>florida</td>\n",
+       "      <td>NXLS FNNXL INKRPRTT</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>68</th>\n",
+       "      <td>68</td>\n",
+       "      <td>2016</td>\n",
+       "      <td>sandisk corporation</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>SNTSK KRPRXN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "    record_id  report_year                     company_name loc_of_incorporation  company_name_mphone\n",
+       "23         23         2016  nicholas financial incorporated              florida  NXLS FNNXL INKRPRTT\n",
+       "24         24         2017  nicholas financial incorporated              florida  NXLS FNNXL INKRPRTT\n",
+       "68         68         2016              sandisk corporation             delaware         SNTSK KRPRXN"
+      ]
+     },
+     "execution_count": 196,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sec_test_df.head(3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 197,
+   "id": "e24e2c8f-1124-4e87-b77d-55fca14a7d3c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>record_id</th>\n",
+       "      <th>report_year</th>\n",
+       "      <th>company_name</th>\n",
+       "      <th>loc_of_incorporation</th>\n",
+       "      <th>company_name_mphone</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>2832746</th>\n",
+       "      <td>0</td>\n",
+       "      <td>2016</td>\n",
+       "      <td>capstone turbine singapore pte., limited</td>\n",
+       "      <td>singapore</td>\n",
+       "      <td>KPSTN TRBN SNKPR PT LMTT</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2832747</th>\n",
+       "      <td>1</td>\n",
+       "      <td>2016</td>\n",
+       "      <td>capstone turbine international, incorporated</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>KPSTN TRBN INTRNXNL INKRPRTT</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2832748</th>\n",
+       "      <td>2</td>\n",
+       "      <td>2016</td>\n",
+       "      <td>capstone turbine financial services, limited l...</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>KPSTN TRBN FNNXL SRFSS LMTT LBLT KMPN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         record_id  report_year                                       company_name loc_of_incorporation                    company_name_mphone\n",
+       "2832746          0         2016           capstone turbine singapore pte., limited            singapore               KPSTN TRBN SNKPR PT LMTT\n",
+       "2832747          1         2016       capstone turbine international, incorporated             delaware           KPSTN TRBN INTRNXNL INKRPRTT\n",
+       "2832748          2         2016  capstone turbine financial services, limited l...             delaware  KPSTN TRBN FNNXL SRFSS LMTT LBLT KMPN"
+      ]
+     },
+     "execution_count": 197,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "ex21_test_df.head(3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 198,
+   "id": "c531657f-5a0a-4ff5-b680-c6a1806feb75",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# can we just load this linker and make predictions? what happens with blocking?\n",
+    "sec_ex21_linker = Linker([sec_test_df, ex21_test_df], sec_ex21_settings, db_api=DuckDBAPI())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 199,
+   "id": "14b239db-a816-428c-a132-dca0ed0998c4",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Blocking time: 0.44 seconds\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "661a74c00c7e41f59787cad30a26ec78",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Predict time: 115.79 seconds\n"
+     ]
+    }
+   ],
+   "source": [
+    "sec_ex21_preds = sec_ex21_linker.inference.predict(threshold_match_probability=0.6)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 200,
+   "id": "08167db9-9d9c-4b09-a839-847f85842324",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sec_ex21_preds_df = sec_ex21_preds.as_pandas_dataframe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 201,
+   "id": "3f349a0a-269a-4f34-95e8-54a8c96c57f8",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>match_weight</th>\n",
+       "      <th>match_probability</th>\n",
+       "      <th>source_dataset_l</th>\n",
+       "      <th>source_dataset_r</th>\n",
+       "      <th>record_id_l</th>\n",
+       "      <th>record_id_r</th>\n",
+       "      <th>company_name_l</th>\n",
+       "      <th>company_name_r</th>\n",
+       "      <th>gamma_company_name</th>\n",
+       "      <th>tf_company_name_l</th>\n",
+       "      <th>tf_company_name_r</th>\n",
+       "      <th>bf_company_name</th>\n",
+       "      <th>bf_tf_adj_company_name</th>\n",
+       "      <th>loc_of_incorporation_l</th>\n",
+       "      <th>loc_of_incorporation_r</th>\n",
+       "      <th>gamma_loc_of_incorporation</th>\n",
+       "      <th>tf_loc_of_incorporation_l</th>\n",
+       "      <th>tf_loc_of_incorporation_r</th>\n",
+       "      <th>bf_loc_of_incorporation</th>\n",
+       "      <th>bf_tf_adj_loc_of_incorporation</th>\n",
+       "      <th>company_name_mphone_l</th>\n",
+       "      <th>company_name_mphone_r</th>\n",
+       "      <th>report_year_l</th>\n",
+       "      <th>report_year_r</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>11.726954</td>\n",
+       "      <td>0.999705</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>95551</td>\n",
+       "      <td>5939</td>\n",
+       "      <td>pendrell corporation</td>\n",
+       "      <td>pentzer corporation</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0.000008</td>\n",
+       "      <td>0.000004</td>\n",
+       "      <td>35295.437753</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>washington</td>\n",
+       "      <td>washington</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0.003427</td>\n",
+       "      <td>0.003427</td>\n",
+       "      <td>2.321780</td>\n",
+       "      <td>60.034545</td>\n",
+       "      <td>PNTRL KRPRXN</td>\n",
+       "      <td>PNTSR KRPRXN</td>\n",
+       "      <td>2017</td>\n",
+       "      <td>2017</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0.981720</td>\n",
+       "      <td>0.663845</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>80041</td>\n",
+       "      <td>1485</td>\n",
+       "      <td>spok holdings, incorporated</td>\n",
+       "      <td>autohaus holdings, incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000008</td>\n",
+       "      <td>0.000004</td>\n",
+       "      <td>2126.980572</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0.354513</td>\n",
+       "      <td>0.354513</td>\n",
+       "      <td>2.321780</td>\n",
+       "      <td>0.580388</td>\n",
+       "      <td>SPK HLTNKS INKRPRTT</td>\n",
+       "      <td>ATHS HLTNKS INKRPRTT</td>\n",
+       "      <td>2017</td>\n",
+       "      <td>2017</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>4.604002</td>\n",
+       "      <td>0.960504</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>72068</td>\n",
+       "      <td>2731</td>\n",
+       "      <td>ashford hospitality trust incorporated</td>\n",
+       "      <td>ashford hospitality trust, incorporated</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0.000008</td>\n",
+       "      <td>0.000004</td>\n",
+       "      <td>35295.437753</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>maryland</td>\n",
+       "      <td>None</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>0.010087</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>AXFRT HSPTLT TRST INKRPRTT</td>\n",
+       "      <td>AXFRT HSPTLT TRST INKRPRTT</td>\n",
+       "      <td>2017</td>\n",
+       "      <td>2017</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>3.901062</td>\n",
+       "      <td>0.937263</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>58652</td>\n",
+       "      <td>1115</td>\n",
+       "      <td>tx holdings, incorporated</td>\n",
+       "      <td>tex holdings, incorporated</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0.000008</td>\n",
+       "      <td>0.000004</td>\n",
+       "      <td>35295.437753</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>georgia</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.005596</td>\n",
+       "      <td>0.354513</td>\n",
+       "      <td>0.614319</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>TKS HLTNKS INKRPRTT</td>\n",
+       "      <td>TKS HLTNKS INKRPRTT</td>\n",
+       "      <td>2017</td>\n",
+       "      <td>2017</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>4.604002</td>\n",
+       "      <td>0.960504</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>82946</td>\n",
+       "      <td>1757</td>\n",
+       "      <td>pharma bio serv, incorporated</td>\n",
+       "      <td>pharma bio serv us, incorporated</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0.000008</td>\n",
+       "      <td>0.000004</td>\n",
+       "      <td>35295.437753</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>None</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0.354513</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>FRM B SRF INKRPRTT</td>\n",
+       "      <td>FRM B SRF US INKRPRTT</td>\n",
+       "      <td>2017</td>\n",
+       "      <td>2017</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9343</th>\n",
+       "      <td>0.981720</td>\n",
+       "      <td>0.663845</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>248688</td>\n",
+       "      <td>1135</td>\n",
+       "      <td>transenterix incorporated</td>\n",
+       "      <td>trane brands, incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000008</td>\n",
+       "      <td>0.000004</td>\n",
+       "      <td>2126.980572</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0.354513</td>\n",
+       "      <td>0.354513</td>\n",
+       "      <td>2.321780</td>\n",
+       "      <td>0.580388</td>\n",
+       "      <td>TRNSNTRKS INKRPRTT</td>\n",
+       "      <td>TRN BRNTS INKRPRTT</td>\n",
+       "      <td>2017</td>\n",
+       "      <td>2017</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9344</th>\n",
+       "      <td>3.901062</td>\n",
+       "      <td>0.937263</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>260283</td>\n",
+       "      <td>3506</td>\n",
+       "      <td>cree incorporated</td>\n",
+       "      <td>j.crew incorporated</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0.000008</td>\n",
+       "      <td>0.000004</td>\n",
+       "      <td>35295.437753</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>north carolina</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.004926</td>\n",
+       "      <td>0.354513</td>\n",
+       "      <td>0.614319</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>KR INKRPRTT</td>\n",
+       "      <td>JKR INKRPRTT</td>\n",
+       "      <td>2017</td>\n",
+       "      <td>2017</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9345</th>\n",
+       "      <td>0.981720</td>\n",
+       "      <td>0.663845</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>232258</td>\n",
+       "      <td>3973</td>\n",
+       "      <td>applied minerals, incorporated</td>\n",
+       "      <td>applied materials spv2, incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000008</td>\n",
+       "      <td>0.000008</td>\n",
+       "      <td>2126.980572</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0.354513</td>\n",
+       "      <td>0.354513</td>\n",
+       "      <td>2.321780</td>\n",
+       "      <td>0.580388</td>\n",
+       "      <td>APLT MNRLS INKRPRTT</td>\n",
+       "      <td>APLT MTRLS SPF INKRPRTT</td>\n",
+       "      <td>2017</td>\n",
+       "      <td>2016</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9346</th>\n",
+       "      <td>3.901062</td>\n",
+       "      <td>0.937263</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>232258</td>\n",
+       "      <td>3970</td>\n",
+       "      <td>applied minerals, incorporated</td>\n",
+       "      <td>applied materials japan, incorporated</td>\n",
+       "      <td>3</td>\n",
+       "      <td>0.000008</td>\n",
+       "      <td>0.000008</td>\n",
+       "      <td>35295.437753</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>japan</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.354513</td>\n",
+       "      <td>0.005795</td>\n",
+       "      <td>0.614319</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>APLT MNRLS INKRPRTT</td>\n",
+       "      <td>APLT MTRLS JPN INKRPRTT</td>\n",
+       "      <td>2017</td>\n",
+       "      <td>2016</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9347</th>\n",
+       "      <td>2.724934</td>\n",
+       "      <td>0.868616</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>267563</td>\n",
+       "      <td>285</td>\n",
+       "      <td>guess incorporated</td>\n",
+       "      <td>aquesys, incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000008</td>\n",
+       "      <td>0.000008</td>\n",
+       "      <td>2126.980572</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>us delaware</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.354513</td>\n",
+       "      <td>0.000462</td>\n",
+       "      <td>4.511276</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>KS INKRPRTT</td>\n",
+       "      <td>AKSS INKRPRTT</td>\n",
+       "      <td>2017</td>\n",
+       "      <td>2016</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>9348 rows × 24 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      match_weight  match_probability         source_dataset_l         source_dataset_r  record_id_l  record_id_r                          company_name_l                           company_name_r  gamma_company_name  tf_company_name_l  tf_company_name_r  bf_company_name  bf_tf_adj_company_name loc_of_incorporation_l loc_of_incorporation_r  gamma_loc_of_incorporation  tf_loc_of_incorporation_l  tf_loc_of_incorporation_r  bf_loc_of_incorporation  bf_tf_adj_loc_of_incorporation       company_name_mphone_l       company_name_mphone_r  report_year_l  report_year_r\n",
+       "0        11.726954           0.999705  __splink__input_table_0  __splink__input_table_1        95551         5939                    pendrell corporation                      pentzer corporation                   3           0.000008           0.000004     35295.437753                     1.0             washington             washington                           3                   0.003427                   0.003427                 2.321780                       60.034545                PNTRL KRPRXN                PNTSR KRPRXN           2017           2017\n",
+       "1         0.981720           0.663845  __splink__input_table_0  __splink__input_table_1        80041         1485             spok holdings, incorporated          autohaus holdings, incorporated                   2           0.000008           0.000004      2126.980572                     1.0               delaware               delaware                           3                   0.354513                   0.354513                 2.321780                        0.580388         SPK HLTNKS INKRPRTT        ATHS HLTNKS INKRPRTT           2017           2017\n",
+       "2         4.604002           0.960504  __splink__input_table_0  __splink__input_table_1        72068         2731  ashford hospitality trust incorporated  ashford hospitality trust, incorporated                   3           0.000008           0.000004     35295.437753                     1.0               maryland                   None                          -1                   0.010087                        NaN                 1.000000                        1.000000  AXFRT HSPTLT TRST INKRPRTT  AXFRT HSPTLT TRST INKRPRTT           2017           2017\n",
+       "3         3.901062           0.937263  __splink__input_table_0  __splink__input_table_1        58652         1115               tx holdings, incorporated               tex holdings, incorporated                   3           0.000008           0.000004     35295.437753                     1.0                georgia               delaware                           0                   0.005596                   0.354513                 0.614319                        1.000000         TKS HLTNKS INKRPRTT         TKS HLTNKS INKRPRTT           2017           2017\n",
+       "4         4.604002           0.960504  __splink__input_table_0  __splink__input_table_1        82946         1757           pharma bio serv, incorporated         pharma bio serv us, incorporated                   3           0.000008           0.000004     35295.437753                     1.0                   None               delaware                          -1                        NaN                   0.354513                 1.000000                        1.000000          FRM B SRF INKRPRTT       FRM B SRF US INKRPRTT           2017           2017\n",
+       "...            ...                ...                      ...                      ...          ...          ...                                     ...                                      ...                 ...                ...                ...              ...                     ...                    ...                    ...                         ...                        ...                        ...                      ...                             ...                         ...                         ...            ...            ...\n",
+       "9343      0.981720           0.663845  __splink__input_table_0  __splink__input_table_1       248688         1135               transenterix incorporated               trane brands, incorporated                   2           0.000008           0.000004      2126.980572                     1.0               delaware               delaware                           3                   0.354513                   0.354513                 2.321780                        0.580388          TRNSNTRKS INKRPRTT          TRN BRNTS INKRPRTT           2017           2017\n",
+       "9344      3.901062           0.937263  __splink__input_table_0  __splink__input_table_1       260283         3506                       cree incorporated                      j.crew incorporated                   3           0.000008           0.000004     35295.437753                     1.0         north carolina               delaware                           0                   0.004926                   0.354513                 0.614319                        1.000000                 KR INKRPRTT                JKR INKRPRTT           2017           2017\n",
+       "9345      0.981720           0.663845  __splink__input_table_0  __splink__input_table_1       232258         3973          applied minerals, incorporated     applied materials spv2, incorporated                   2           0.000008           0.000008      2126.980572                     1.0               delaware               delaware                           3                   0.354513                   0.354513                 2.321780                        0.580388         APLT MNRLS INKRPRTT     APLT MTRLS SPF INKRPRTT           2017           2016\n",
+       "9346      3.901062           0.937263  __splink__input_table_0  __splink__input_table_1       232258         3970          applied minerals, incorporated    applied materials japan, incorporated                   3           0.000008           0.000008     35295.437753                     1.0               delaware                  japan                           0                   0.354513                   0.005795                 0.614319                        1.000000         APLT MNRLS INKRPRTT     APLT MTRLS JPN INKRPRTT           2017           2016\n",
+       "9347      2.724934           0.868616  __splink__input_table_0  __splink__input_table_1       267563          285                      guess incorporated                    aquesys, incorporated                   2           0.000008           0.000008      2126.980572                     1.0               delaware            us delaware                           2                   0.354513                   0.000462                 4.511276                        1.000000                 KS INKRPRTT               AKSS INKRPRTT           2017           2016\n",
+       "\n",
+       "[9348 rows x 24 columns]"
+      ]
+     },
+     "execution_count": 201,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# TODO: the quality of these matches needs to be improved; maybe just do a fuzzy match on the company name string?\n",
+    "sec_ex21_preds_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "defdf953-4af7-4d43-b7cf-5ae95360d70f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TODO: add the Ex. 21 subsidiaries that don't get a matching CIK to the SEC side. Steps:\n",
+    "#   1. run the matching on all the data (not just the test subsets)\n",
+    "#   2. save the mapping for subsidiaries whose match score is above a certain threshold (unclear why the blocking isn't working)\n",
+    "#   3. get the subsidiaries whose match score is below a certain threshold\n",
+    "#   4. transform them to have columns that match the SEC df\n",
+    "#   5. add them to the SEC side"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "46d967d4-3722-437d-b2f0-37cbac17624f",
+   "metadata": {},
+   "source": [
+    "# Link SEC and EIA"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "509988b1-ed2c-41b3-9334-f44ae599cf4f",
+   "metadata": {},
+   "source": [
+    "## Exploratory Analysis"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 205,
+   "id": "de894ad0-0b72-4486-b737-c3bfb95f8f05",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "db_api = DuckDBAPI()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 233,
+   "id": "ac4e560b-6946-4cc7-b2bc-6d5f4b154da6",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "<style>\n",
+       "  #altair-viz-6c1f9f2c0b1d402d8bfc2877d4b72215.vega-embed {\n",
+       "    width: 100%;\n",
+       "    display: flex;\n",
+       "  }\n",
+       "\n",
+       "  #altair-viz-6c1f9f2c0b1d402d8bfc2877d4b72215.vega-embed details,\n",
+       "  #altair-viz-6c1f9f2c0b1d402d8bfc2877d4b72215.vega-embed details summary {\n",
+       "    position: relative;\n",
+       "  }\n",
+       "</style>\n",
+       "<div id=\"altair-viz-6c1f9f2c0b1d402d8bfc2877d4b72215\"></div>\n",
+       "<script type=\"text/javascript\">\n",
+       "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
+       "  (function(spec, embedOpt){\n",
+       "    let outputDiv = document.currentScript.previousElementSibling;\n",
+       "    if (outputDiv.id !== \"altair-viz-6c1f9f2c0b1d402d8bfc2877d4b72215\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-6c1f9f2c0b1d402d8bfc2877d4b72215\");\n",
+       "    }\n",
+       "    const paths = {\n",
+       "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
+       "      \"vega-lib\": \"https://cdn.jsdelivr.net/npm/vega-lib?noext\",\n",
+       "      \"vega-lite\": \"https://cdn.jsdelivr.net/npm/vega-lite@5.20.1?noext\",\n",
+       "      \"vega-embed\": \"https://cdn.jsdelivr.net/npm/vega-embed@6?noext\",\n",
+       "    };\n",
+       "\n",
+       "    function maybeLoadScript(lib, version) {\n",
+       "      var key = `${lib.replace(\"-\", \"\")}_version`;\n",
+       "      return (VEGA_DEBUG[key] == version) ?\n",
+       "        Promise.resolve(paths[lib]) :\n",
+       "        new Promise(function(resolve, reject) {\n",
+       "          var s = document.createElement('script');\n",
+       "          document.getElementsByTagName(\"head\")[0].appendChild(s);\n",
+       "          s.async = true;\n",
+       "          s.onload = () => {\n",
+       "            VEGA_DEBUG[key] = version;\n",
+       "            return resolve(paths[lib]);\n",
+       "          };\n",
+       "          s.onerror = () => reject(`Error loading script: ${paths[lib]}`);\n",
+       "          s.src = paths[lib];\n",
+       "        });\n",
+       "    }\n",
+       "\n",
+       "    function showError(err) {\n",
+       "      outputDiv.innerHTML = `<div class=\"error\" style=\"color:red;\">${err}</div>`;\n",
+       "      throw err;\n",
+       "    }\n",
+       "\n",
+       "    function displayChart(vegaEmbed) {\n",
+       "      vegaEmbed(outputDiv, spec, embedOpt)\n",
+       "        .catch(err => showError(`Javascript Error: ${err.message}<br>This usually means there's a typo in your chart specification. See the javascript console for the full traceback.`));\n",
+       "    }\n",
+       "\n",
+       "    if(typeof define === \"function\" && define.amd) {\n",
+       "      requirejs.config({paths});\n",
+       "      require([\"vega-embed\"], displayChart, err => showError(`Error loading script: ${err.message}`));\n",
+       "    } else {\n",
+       "      maybeLoadScript(\"vega\", \"5\")\n",
+       "        .then(() => maybeLoadScript(\"vega-lite\", \"5.20.1\"))\n",
+       "        .then(() => maybeLoadScript(\"vega-embed\", \"6\"))\n",
+       "        .catch(showError)\n",
+       "        .then(() => displayChart(vegaEmbed));\n",
+       "    }\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"layer\": [{\"mark\": \"rect\", \"encoding\": {\"color\": {\"field\": \"completeness\", \"legend\": null, \"scale\": {\"scheme\": \"darkred\", \"zero\": true}, \"type\": \"quantitative\"}, \"tooltip\": [{\"field\": \"source_dataset\", \"title\": \"Source dataset\", \"type\": \"nominal\"}, {\"field\": \"total_rows_inc_nulls\", \"format\": \",\", \"title\": \"# of records\", \"type\": \"quantitative\"}, {\"field\": \"column_name\", \"title\": \"Column name\", \"type\": \"nominal\"}, {\"field\": \"total_null_rows\", \"format\": \",\", \"title\": \"# of nulls\", \"type\": \"quantitative\"}, {\"field\": \"completeness\", \"format\": \".1%\", \"type\": \"quantitative\"}], \"x\": {\"axis\": {\"labelAngle\": 20}, \"field\": \"column_name\", \"sort\": {\"field\": \"mean_comp\", \"order\": \"descending\"}, \"title\": \"Column name\", \"type\": \"nominal\"}, \"y\": {\"field\": \"source_dataset\", \"title\": \"Source dataset\", \"type\": \"nominal\"}}, \"title\": \"Column completeness by source dataset\", \"transform\": [{\"joinaggregate\": [{\"op\": \"mean\", \"field\": \"completeness\", \"as\": \"mean_comp\"}], \"groupby\": [\"column_name\"]}]}, {\"mark\": {\"type\": \"text\"}, \"encoding\": {\"color\": {\"condition\": {\"test\": \"datum['completeness'] < 0.5\", \"value\": \"white\"}, \"value\": \"black\"}, \"text\": {\"field\": \"completeness\", \"format\": \".0%\", \"type\": \"quantitative\"}, \"x\": {\"axis\": {\"labelAngle\": 0}, \"field\": \"column_name\", \"sort\": {\"field\": \"mean_comp\", \"order\": \"descending\"}, \"type\": \"nominal\"}, \"y\": {\"field\": \"source_dataset\", \"type\": \"nominal\"}}, \"transform\": [{\"joinaggregate\": [{\"op\": \"mean\", \"field\": \"completeness\", \"as\": \"mean_comp\"}], \"groupby\": [\"column_name\"]}]}], \"data\": {\"name\": \"data-fb2bb6472d120ab63768ece05202f6ba\"}, \"height\": {\"step\": 40}, \"width\": {\"step\": 40}, 
\"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-fb2bb6472d120ab63768ece05202f6ba\": [{\"source_dataset\": \"input_data_1\", \"column_name\": \"record_id\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 230320, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"report_date\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 230320, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"report_year\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 230320, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name\", \"total_null_rows\": 4, \"total_rows_inc_nulls\": 230320, \"completeness\": 0.9999826550483704}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"street_address\", \"total_null_rows\": 272, \"total_rows_inc_nulls\": 230320, \"completeness\": 0.9988190531730652}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"street_address_2\", \"total_null_rows\": 126403, \"total_rows_inc_nulls\": 230320, \"completeness\": 0.45118531584739685}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"city\", \"total_null_rows\": 107, \"total_rows_inc_nulls\": 230320, \"completeness\": 0.9995354413986206}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"state\", \"total_null_rows\": 206, \"total_rows_inc_nulls\": 230320, \"completeness\": 0.9991055727005005}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"zip_code\", \"total_null_rows\": 619, \"total_rows_inc_nulls\": 230320, \"completeness\": 0.9973124265670776}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"phone_number\", \"total_null_rows\": 6194, \"total_rows_inc_nulls\": 230320, \"completeness\": 0.9731069803237915}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name_mphone\", \"total_null_rows\": 4, \"total_rows_inc_nulls\": 230320, \"completeness\": 0.9999826550483704}]}}, {\"mode\": 
\"vega-lite\"});\n",
+       "</script>"
+      ],
+      "text/plain": [
+       "alt.LayerChart(...)"
+      ]
+     },
+     "execution_count": 233,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "completeness_chart(sec_match_df, db_api=db_api)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 234,
+   "id": "02063bcd-8301-4a70-aab1-0bbf6119cf8b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "<style>\n",
+       "  #altair-viz-b9db261bb6eb4d978d2b694dc2c37711.vega-embed {\n",
+       "    width: 100%;\n",
+       "    display: flex;\n",
+       "  }\n",
+       "\n",
+       "  #altair-viz-b9db261bb6eb4d978d2b694dc2c37711.vega-embed details,\n",
+       "  #altair-viz-b9db261bb6eb4d978d2b694dc2c37711.vega-embed details summary {\n",
+       "    position: relative;\n",
+       "  }\n",
+       "</style>\n",
+       "<div id=\"altair-viz-b9db261bb6eb4d978d2b694dc2c37711\"></div>\n",
+       "<script type=\"text/javascript\">\n",
+       "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
+       "  (function(spec, embedOpt){\n",
+       "    let outputDiv = document.currentScript.previousElementSibling;\n",
+       "    if (outputDiv.id !== \"altair-viz-b9db261bb6eb4d978d2b694dc2c37711\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-b9db261bb6eb4d978d2b694dc2c37711\");\n",
+       "    }\n",
+       "    const paths = {\n",
+       "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
+       "      \"vega-lib\": \"https://cdn.jsdelivr.net/npm/vega-lib?noext\",\n",
+       "      \"vega-lite\": \"https://cdn.jsdelivr.net/npm/vega-lite@5.20.1?noext\",\n",
+       "      \"vega-embed\": \"https://cdn.jsdelivr.net/npm/vega-embed@6?noext\",\n",
+       "    };\n",
+       "\n",
+       "    function maybeLoadScript(lib, version) {\n",
+       "      var key = `${lib.replace(\"-\", \"\")}_version`;\n",
+       "      return (VEGA_DEBUG[key] == version) ?\n",
+       "        Promise.resolve(paths[lib]) :\n",
+       "        new Promise(function(resolve, reject) {\n",
+       "          var s = document.createElement('script');\n",
+       "          document.getElementsByTagName(\"head\")[0].appendChild(s);\n",
+       "          s.async = true;\n",
+       "          s.onload = () => {\n",
+       "            VEGA_DEBUG[key] = version;\n",
+       "            return resolve(paths[lib]);\n",
+       "          };\n",
+       "          s.onerror = () => reject(`Error loading script: ${paths[lib]}`);\n",
+       "          s.src = paths[lib];\n",
+       "        });\n",
+       "    }\n",
+       "\n",
+       "    function showError(err) {\n",
+       "      outputDiv.innerHTML = `<div class=\"error\" style=\"color:red;\">${err}</div>`;\n",
+       "      throw err;\n",
+       "    }\n",
+       "\n",
+       "    function displayChart(vegaEmbed) {\n",
+       "      vegaEmbed(outputDiv, spec, embedOpt)\n",
+       "        .catch(err => showError(`Javascript Error: ${err.message}<br>This usually means there's a typo in your chart specification. See the javascript console for the full traceback.`));\n",
+       "    }\n",
+       "\n",
+       "    if(typeof define === \"function\" && define.amd) {\n",
+       "      requirejs.config({paths});\n",
+       "      require([\"vega-embed\"], displayChart, err => showError(`Error loading script: ${err.message}`));\n",
+       "    } else {\n",
+       "      maybeLoadScript(\"vega\", \"5\")\n",
+       "        .then(() => maybeLoadScript(\"vega-lite\", \"5.20.1\"))\n",
+       "        .then(() => maybeLoadScript(\"vega-embed\", \"6\"))\n",
+       "        .catch(showError)\n",
+       "        .then(() => displayChart(vegaEmbed));\n",
+       "    }\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"layer\": [{\"mark\": \"rect\", \"encoding\": {\"color\": {\"field\": \"completeness\", \"legend\": null, \"scale\": {\"scheme\": \"darkred\", \"zero\": true}, \"type\": \"quantitative\"}, \"tooltip\": [{\"field\": \"source_dataset\", \"title\": \"Source dataset\", \"type\": \"nominal\"}, {\"field\": \"total_rows_inc_nulls\", \"format\": \",\", \"title\": \"# of records\", \"type\": \"quantitative\"}, {\"field\": \"column_name\", \"title\": \"Column name\", \"type\": \"nominal\"}, {\"field\": \"total_null_rows\", \"format\": \",\", \"title\": \"# of nulls\", \"type\": \"quantitative\"}, {\"field\": \"completeness\", \"format\": \".1%\", \"type\": \"quantitative\"}], \"x\": {\"axis\": {\"labelAngle\": 20}, \"field\": \"column_name\", \"sort\": {\"field\": \"mean_comp\", \"order\": \"descending\"}, \"title\": \"Column name\", \"type\": \"nominal\"}, \"y\": {\"field\": \"source_dataset\", \"title\": \"Source dataset\", \"type\": \"nominal\"}}, \"title\": \"Column completeness by source dataset\", \"transform\": [{\"joinaggregate\": [{\"op\": \"mean\", \"field\": \"completeness\", \"as\": \"mean_comp\"}], \"groupby\": [\"column_name\"]}]}, {\"mark\": {\"type\": \"text\"}, \"encoding\": {\"color\": {\"condition\": {\"test\": \"datum['completeness'] < 0.5\", \"value\": \"white\"}, \"value\": \"black\"}, \"text\": {\"field\": \"completeness\", \"format\": \".0%\", \"type\": \"quantitative\"}, \"x\": {\"axis\": {\"labelAngle\": 0}, \"field\": \"column_name\", \"sort\": {\"field\": \"mean_comp\", \"order\": \"descending\"}, \"type\": \"nominal\"}, \"y\": {\"field\": \"source_dataset\", \"type\": \"nominal\"}}, \"transform\": [{\"joinaggregate\": [{\"op\": \"mean\", \"field\": \"completeness\", \"as\": \"mean_comp\"}], \"groupby\": [\"column_name\"]}]}], \"data\": {\"name\": \"data-3736beb63dd0913fa1793471df7936c9\"}, \"height\": {\"step\": 40}, \"width\": {\"step\": 40}, 
\"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-3736beb63dd0913fa1793471df7936c9\": [{\"source_dataset\": \"input_data_1\", \"column_name\": \"record_id\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 176366, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"report_date\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 176366, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"report_year\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 176366, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 176366, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"street_address\", \"total_null_rows\": 70684, \"total_rows_inc_nulls\": 176366, \"completeness\": 0.599219799041748}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"street_address_2\", \"total_null_rows\": 142421, \"total_rows_inc_nulls\": 176366, \"completeness\": 0.19246907532215118}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"city\", \"total_null_rows\": 47174, \"total_rows_inc_nulls\": 176366, \"completeness\": 0.7325221300125122}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"state\", \"total_null_rows\": 19847, \"total_rows_inc_nulls\": 176366, \"completeness\": 0.8874669671058655}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"zip_code\", \"total_null_rows\": 48235, \"total_rows_inc_nulls\": 176366, \"completeness\": 0.726506233215332}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"phone_number\", \"total_null_rows\": 164751, \"total_rows_inc_nulls\": 176366, \"completeness\": 0.06585736572742462}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name_mphone\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 176366, \"completeness\": 1.0}]}}, {\"mode\": \"vega-lite\"});\n",
+       "</script>"
+      ],
+      "text/plain": [
+       "alt.LayerChart(...)"
+      ]
+     },
+     "execution_count": 234,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "completeness_chart(eia_match_df, db_api=db_api)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 209,
+   "id": "c4542c1f-d826-43c1-9af5-ce6473b79d90",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# could sub in zip code for street address?\n",
+    "match_cols = [\"company_name\", \"state\", \"city\", \"street_address\", \"zip_code\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 210,
+   "id": "95bd4db7-c447-4a0e-ab37-59d4beac0b11",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "<style>\n",
+       "  #altair-viz-d91c69c848924e72ba734dbe839979d9.vega-embed {\n",
+       "    width: 100%;\n",
+       "    display: flex;\n",
+       "  }\n",
+       "\n",
+       "  #altair-viz-d91c69c848924e72ba734dbe839979d9.vega-embed details,\n",
+       "  #altair-viz-d91c69c848924e72ba734dbe839979d9.vega-embed details summary {\n",
+       "    position: relative;\n",
+       "  }\n",
+       "</style>\n",
+       "<div id=\"altair-viz-d91c69c848924e72ba734dbe839979d9\"></div>\n",
+       "<script type=\"text/javascript\">\n",
+       "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
+       "  (function(spec, embedOpt){\n",
+       "    let outputDiv = document.currentScript.previousElementSibling;\n",
+       "    if (outputDiv.id !== \"altair-viz-d91c69c848924e72ba734dbe839979d9\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-d91c69c848924e72ba734dbe839979d9\");\n",
+       "    }\n",
+       "    const paths = {\n",
+       "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
+       "      \"vega-lib\": \"https://cdn.jsdelivr.net/npm/vega-lib?noext\",\n",
+       "      \"vega-lite\": \"https://cdn.jsdelivr.net/npm/vega-lite@5.20.1?noext\",\n",
+       "      \"vega-embed\": \"https://cdn.jsdelivr.net/npm/vega-embed@6?noext\",\n",
+       "    };\n",
+       "\n",
+       "    function maybeLoadScript(lib, version) {\n",
+       "      var key = `${lib.replace(\"-\", \"\")}_version`;\n",
+       "      return (VEGA_DEBUG[key] == version) ?\n",
+       "        Promise.resolve(paths[lib]) :\n",
+       "        new Promise(function(resolve, reject) {\n",
+       "          var s = document.createElement('script');\n",
+       "          document.getElementsByTagName(\"head\")[0].appendChild(s);\n",
+       "          s.async = true;\n",
+       "          s.onload = () => {\n",
+       "            VEGA_DEBUG[key] = version;\n",
+       "            return resolve(paths[lib]);\n",
+       "          };\n",
+       "          s.onerror = () => reject(`Error loading script: ${paths[lib]}`);\n",
+       "          s.src = paths[lib];\n",
+       "        });\n",
+       "    }\n",
+       "\n",
+       "    function showError(err) {\n",
+       "      outputDiv.innerHTML = `<div class=\"error\" style=\"color:red;\">${err}</div>`;\n",
+       "      throw err;\n",
+       "    }\n",
+       "\n",
+       "    function displayChart(vegaEmbed) {\n",
+       "      vegaEmbed(outputDiv, spec, embedOpt)\n",
+       "        .catch(err => showError(`Javascript Error: ${err.message}<br>This usually means there's a typo in your chart specification. See the javascript console for the full traceback.`));\n",
+       "    }\n",
+       "\n",
+       "    if(typeof define === \"function\" && define.amd) {\n",
+       "      requirejs.config({paths});\n",
+       "      require([\"vega-embed\"], displayChart, err => showError(`Error loading script: ${err.message}`));\n",
+       "    } else {\n",
+       "      maybeLoadScript(\"vega\", \"5\")\n",
+       "        .then(() => maybeLoadScript(\"vega-lite\", \"5.20.1\"))\n",
+       "        .then(() => maybeLoadScript(\"vega-embed\", \"6\"))\n",
+       "        .catch(showError)\n",
+       "        .then(() => displayChart(vegaEmbed));\n",
+       "    }\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"vconcat\": [{\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9821549654006958, \"percentile_inc_nulls\": 0.9821552634239197, \"value_count\": 30, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 4110.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.9618828296661377, \"percentile_inc_nulls\": 0.9618834853172302, \"value_count\": 29, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 4669.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.9368389844894409, \"percentile_inc_nulls\": 0.9368400573730469, \"value_count\": 28, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 5768.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.9135101437568665, \"percentile_inc_nulls\": 0.9135116338729858, \"value_count\": 27, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 5373.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.899511992931366, \"percentile_inc_nulls\": 0.8995137214660645, \"value_count\": 26, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3224.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.8856180310249329, \"percentile_inc_nulls\": 0.8856199979782104, \"value_count\": 25, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3200.0, \"distinct_value_count\": 47323}, 
{\"percentile_ex_nulls\": 0.8689452409744263, \"percentile_inc_nulls\": 0.8689475655555725, \"value_count\": 24, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3840.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.8524678945541382, \"percentile_inc_nulls\": 0.8524704575538635, \"value_count\": 23, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3795.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.8382352590560913, \"percentile_inc_nulls\": 0.8382381200790405, \"value_count\": 22, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3278.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.8196347951889038, \"percentile_inc_nulls\": 0.8196378946304321, \"value_count\": 21, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 4284.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.8045250773429871, \"percentile_inc_nulls\": 0.8045284748077393, \"value_count\": 20, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3480.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.7848911881446838, \"percentile_inc_nulls\": 0.7848949432373047, \"value_count\": 19, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 4522.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.761445164680481, \"percentile_inc_nulls\": 0.7614492774009705, \"value_count\": 18, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, 
\"sum_tokens_in_value_count_group\": 5400.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.7384159564971924, \"percentile_inc_nulls\": 0.7384204864501953, \"value_count\": 17, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 5304.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.7129899859428406, \"percentile_inc_nulls\": 0.7129949331283569, \"value_count\": 16, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 5856.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.6855711340904236, \"percentile_inc_nulls\": 0.6855765581130981, \"value_count\": 15, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 6315.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.659433126449585, \"percentile_inc_nulls\": 0.6594390273094177, \"value_count\": 14, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 6020.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.630138635635376, \"percentile_inc_nulls\": 0.6301450133323669, \"value_count\": 13, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 6747.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.5980435609817505, \"percentile_inc_nulls\": 0.5980505347251892, \"value_count\": 12, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 7392.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.5616500377655029, \"percentile_inc_nulls\": 0.5616576671600342, \"value_count\": 11, \"group_name\": \"_company_name_\", 
\"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 8382.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.5209668874740601, \"percentile_inc_nulls\": 0.5209751725196838, \"value_count\": 10, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 9370.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.473879337310791, \"percentile_inc_nulls\": 0.4738885164260864, \"value_count\": 9, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 10845.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.4248337149620056, \"percentile_inc_nulls\": 0.4248436689376831, \"value_count\": 8, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 11296.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.3734694719314575, \"percentile_inc_nulls\": 0.3734803795814514, \"value_count\": 7, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 11830.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.3194654583930969, \"percentile_inc_nulls\": 0.31947726011276245, \"value_count\": 6, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 12438.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.2599167823791504, \"percentile_inc_nulls\": 0.2599296569824219, \"value_count\": 5, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 13715.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.19405949115753174, \"percentile_inc_nulls\": 
0.19407343864440918, \"value_count\": 4, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 15168.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.13083326816558838, \"percentile_inc_nulls\": 0.13084840774536133, \"value_count\": 3, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 14562.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.0675550103187561, \"percentile_inc_nulls\": 0.06757122278213501, \"value_count\": 2, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 14574.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 1.7344951629638672e-05, \"value_count\": 1, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 15559.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 30, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 4110.0, \"distinct_value_count\": 47323}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of 
values in column \\\"company_name\\\"\", \"subtitle\": \"In this col, 4 values (0.0%) are null and there are 47323 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 30, \"group_name\": \"_company_name_\", \"value\": \"sherwin williams company\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 47323}, {\"value_count\": 30, \"group_name\": \"_company_name_\", \"value\": \"simmons first national corporation\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 47323}, {\"value_count\": 30, \"group_name\": \"_company_name_\", \"value\": \"smith a o corporation\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 47323}, {\"value_count\": 30, \"group_name\": \"_company_name_\", \"value\": \"unifi incorporated\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 47323}, {\"value_count\": 30, \"group_name\": \"_company_name_\", \"value\": \"universal corp /va/\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 47323}, {\"value_count\": 30, \"group_name\": \"_company_name_\", \"value\": \"vulcan materials company\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 47323}, {\"value_count\": 30, \"group_name\": \"_company_name_\", \"value\": \"boeing company\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 47323}, {\"value_count\": 30, \"group_name\": \"_company_name_\", \"value\": \"rayonier incorporated\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 47323}, {\"value_count\": 30, \"group_name\": \"_company_name_\", \"value\": \"wesbanco incorporated\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 47323}, {\"value_count\": 30, \"group_name\": 
\"_company_name_\", \"value\": \"deere john capital corporation\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 47323}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"ambers stores incorporated\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 47323}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"nx networks incorporated\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 47323}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"hwcc tunica incorporated\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 47323}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"james maritime holdings incorporated\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 47323}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"sportmart incorporated\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 47323}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": 
{\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 30]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.8475755453109741, \"percentile_inc_nulls\": 0.8477118611335754, \"value_count\": 35075, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 35075.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.7260271310806274, \"percentile_inc_nulls\": 0.7262721061706543, \"value_count\": 27970, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 27970.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.6417819261550903, \"percentile_inc_nulls\": 0.6421023011207581, \"value_count\": 19386, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 19386.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.5964608788490295, \"percentile_inc_nulls\": 0.5968217849731445, \"value_count\": 10429, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 10429.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.5520437955856323, \"percentile_inc_nulls\": 0.5524444580078125, \"value_count\": 10221, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 10221.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.5123851299285889, \"percentile_inc_nulls\": 0.5128213167190552, \"value_count\": 9126, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, 
\"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 9126.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.4752122759819031, \"percentile_inc_nulls\": 0.4756816625595093, \"value_count\": 8554, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 8554.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.43829578161239624, \"percentile_inc_nulls\": 0.438798189163208, \"value_count\": 8495, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 8495.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.41224348545074463, \"percentile_inc_nulls\": 0.4127691984176636, \"value_count\": 5995, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 5995.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.3880468010902405, \"percentile_inc_nulls\": 0.3885941505432129, \"value_count\": 5568, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 5568.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.3645975589752197, \"percentile_inc_nulls\": 0.3651658296585083, \"value_count\": 5396, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 5396.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.3423563838005066, \"percentile_inc_nulls\": 0.3429446220397949, \"value_count\": 5118, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 5118.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.32081925868988037, \"percentile_inc_nulls\": 0.3214266896247864, \"value_count\": 4956, \"group_name\": \"_state_\", 
\"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 4956.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.2995384931564331, \"percentile_inc_nulls\": 0.30016499757766724, \"value_count\": 4897, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 4897.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.2785836458206177, \"percentile_inc_nulls\": 0.27922892570495605, \"value_count\": 4822, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 4822.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.2577331066131592, \"percentile_inc_nulls\": 0.2583969831466675, \"value_count\": 4798, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 4798.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.23978984355926514, \"percentile_inc_nulls\": 0.24046975374221802, \"value_count\": 4129, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 4129.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.22265487909317017, \"percentile_inc_nulls\": 0.22335010766983032, \"value_count\": 3943, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3943.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.20564591884613037, \"percentile_inc_nulls\": 0.20635634660720825, \"value_count\": 3914, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3914.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.1933215856552124, \"percentile_inc_nulls\": 0.1940430998802185, \"value_count\": 2836, 
\"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2836.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.18145787715911865, \"percentile_inc_nulls\": 0.18219000101089478, \"value_count\": 2730, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2730.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.16995924711227417, \"percentile_inc_nulls\": 0.17070162296295166, \"value_count\": 2646, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2646.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.1585344672203064, \"percentile_inc_nulls\": 0.15928709506988525, \"value_count\": 2629, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2629.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.1479397416114807, \"percentile_inc_nulls\": 0.14870178699493408, \"value_count\": 2438, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2438.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.13760137557983398, \"percentile_inc_nulls\": 0.13837271928787231, \"value_count\": 2379, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2379.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.12890130281448364, \"percentile_inc_nulls\": 0.12968045473098755, \"value_count\": 2002, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2002.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.12041854858398438, \"percentile_inc_nulls\": 
0.12120527029037476, \"value_count\": 1952, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1952.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.11287450790405273, \"percentile_inc_nulls\": 0.11366796493530273, \"value_count\": 1736, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1736.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.10543036460876465, \"percentile_inc_nulls\": 0.10623043775558472, \"value_count\": 1713, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1713.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.09882056713104248, \"percentile_inc_nulls\": 0.09962660074234009, \"value_count\": 1521, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1521.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.09293222427368164, \"percentile_inc_nulls\": 0.09374350309371948, \"value_count\": 1355, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1355.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.08733934164047241, \"percentile_inc_nulls\": 0.08815562725067139, \"value_count\": 1287, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1287.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.08176815509796143, \"percentile_inc_nulls\": 0.08258944749832153, \"value_count\": 1282, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1282.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 
0.07628393173217773, \"percentile_inc_nulls\": 0.07711011171340942, \"value_count\": 1262, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1262.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.07103002071380615, \"percentile_inc_nulls\": 0.0718609094619751, \"value_count\": 1209, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1209.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.06596297025680542, \"percentile_inc_nulls\": 0.0667983889579773, \"value_count\": 1166, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1166.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.06157815456390381, \"percentile_inc_nulls\": 0.06241750717163086, \"value_count\": 1009, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1009.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.05761057138442993, \"percentile_inc_nulls\": 0.05845344066619873, \"value_count\": 913, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 913.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.054459989070892334, \"percentile_inc_nulls\": 0.055305659770965576, \"value_count\": 725, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 725.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.05156141519546509, \"percentile_inc_nulls\": 0.05240970849990845, \"value_count\": 667, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 667.0, \"distinct_value_count\": 173}, 
{\"percentile_ex_nulls\": 0.0487019419670105, \"percentile_inc_nulls\": 0.0495527982711792, \"value_count\": 658, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 658.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0459250807762146, \"percentile_inc_nulls\": 0.04677838087081909, \"value_count\": 639, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 639.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.04325681924819946, \"percentile_inc_nulls\": 0.044112563133239746, \"value_count\": 614, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 614.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.04063636064529419, \"percentile_inc_nulls\": 0.04149442911148071, \"value_count\": 603, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 603.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.03809851408004761, \"percentile_inc_nulls\": 0.038958847522735596, \"value_count\": 584, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 584.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.03566926717758179, \"percentile_inc_nulls\": 0.036531805992126465, \"value_count\": 559, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 559.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.03332263231277466, \"percentile_inc_nulls\": 0.0341871976852417, \"value_count\": 540, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 540.0, 
\"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.031075894832611084, \"percentile_inc_nulls\": 0.03194248676300049, \"value_count\": 517, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 517.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.029320240020751953, \"percentile_inc_nulls\": 0.030188441276550293, \"value_count\": 404, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 404.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.027595043182373047, \"percentile_inc_nulls\": 0.02846473455429077, \"value_count\": 397, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 397.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.026056647300720215, \"percentile_inc_nulls\": 0.02692776918411255, \"value_count\": 354, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 354.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.024522602558135986, \"percentile_inc_nulls\": 0.025395095348358154, \"value_count\": 353, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 353.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.02299731969833374, \"percentile_inc_nulls\": 0.023871123790740967, \"value_count\": 351, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 351.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.021511077880859375, \"percentile_inc_nulls\": 0.022386252880096436, \"value_count\": 342, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, 
\"sum_tokens_in_value_count_group\": 342.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.020146548748016357, \"percentile_inc_nulls\": 0.021022915840148926, \"value_count\": 314, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 314.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.018903672695159912, \"percentile_inc_nulls\": 0.019781172275543213, \"value_count\": 286, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 286.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.017669498920440674, \"percentile_inc_nulls\": 0.018548130989074707, \"value_count\": 284, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 284.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.016574382781982422, \"percentile_inc_nulls\": 0.01745396852493286, \"value_count\": 252, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 252.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.015501022338867188, \"percentile_inc_nulls\": 0.016381561756134033, \"value_count\": 247, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 247.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.014471113681793213, \"percentile_inc_nulls\": 0.01535254716873169, \"value_count\": 237, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 237.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.013471603393554688, \"percentile_inc_nulls\": 0.014353930950164795, \"value_count\": 230, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, 
\"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 230.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.012498140335083008, \"percentile_inc_nulls\": 0.013381361961364746, \"value_count\": 224, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 224.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.011594235897064209, \"percentile_inc_nulls\": 0.012478291988372803, \"value_count\": 208, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 208.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.010781586170196533, \"percentile_inc_nulls\": 0.011666357517242432, \"value_count\": 187, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 187.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.010086297988891602, \"percentile_inc_nulls\": 0.010971665382385254, \"value_count\": 160, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.008739173412322998, \"percentile_inc_nulls\": 0.009625732898712158, \"value_count\": 155, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 310.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.008069932460784912, \"percentile_inc_nulls\": 0.008957087993621826, \"value_count\": 154, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 154.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.007483243942260742, \"percentile_inc_nulls\": 0.008370935916900635, \"value_count\": 135, \"group_name\": \"_state_\", 
\"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 135.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.007044315338134766, \"percentile_inc_nulls\": 0.007932424545288086, \"value_count\": 101, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 101.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.006609737873077393, \"percentile_inc_nulls\": 0.007498264312744141, \"value_count\": 100, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 100.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0062621235847473145, \"percentile_inc_nulls\": 0.0071509480476379395, \"value_count\": 80, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 80.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.005979657173156738, \"percentile_inc_nulls\": 0.006868720054626465, \"value_count\": 65, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 65.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.005710184574127197, \"percentile_inc_nulls\": 0.006599485874176025, \"value_count\": 62, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 62.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.005188703536987305, \"percentile_inc_nulls\": 0.006078481674194336, \"value_count\": 60, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.004936695098876953, \"percentile_inc_nulls\": 0.0058266520500183105, \"value_count\": 58, 
\"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0047324299812316895, \"percentile_inc_nulls\": 0.0056226253509521484, \"value_count\": 47, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 47.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.004536867141723633, \"percentile_inc_nulls\": 0.005427241325378418, \"value_count\": 45, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 45.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.004384756088256836, \"percentile_inc_nulls\": 0.005275249481201172, \"value_count\": 35, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 35.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.004089295864105225, \"percentile_inc_nulls\": 0.004980027675628662, \"value_count\": 34, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 68.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.003945887088775635, \"percentile_inc_nulls\": 0.004836738109588623, \"value_count\": 33, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0038068294525146484, \"percentile_inc_nulls\": 0.0046977996826171875, \"value_count\": 32, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 32.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0036807656288146973, \"percentile_inc_nulls\": 
0.0045719146728515625, \"value_count\": 29, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 29.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.003559112548828125, \"percentile_inc_nulls\": 0.004450321197509766, \"value_count\": 28, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 28.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.003207087516784668, \"percentile_inc_nulls\": 0.004098653793334961, \"value_count\": 27, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 81.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.002981126308441162, \"percentile_inc_nulls\": 0.0038728713989257812, \"value_count\": 26, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 52.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0028768181800842285, \"percentile_inc_nulls\": 0.0037686824798583984, \"value_count\": 24, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 24.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0027768611907958984, \"percentile_inc_nulls\": 0.0036687850952148438, \"value_count\": 23, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 23.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.002681255340576172, \"percentile_inc_nulls\": 0.003573298454284668, \"value_count\": 22, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 22.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.002590000629425049, 
\"percentile_inc_nulls\": 0.0034821033477783203, \"value_count\": 21, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 21.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0022423863410949707, \"percentile_inc_nulls\": 0.003134787082672119, \"value_count\": 20, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 80.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0019946694374084473, \"percentile_inc_nulls\": 0.0028873085975646973, \"value_count\": 19, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 57.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0019164681434631348, \"percentile_inc_nulls\": 0.0028091073036193848, \"value_count\": 18, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 18.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.001846909523010254, \"percentile_inc_nulls\": 0.0027396678924560547, \"value_count\": 16, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.001603543758392334, \"percentile_inc_nulls\": 0.0024965405464172363, \"value_count\": 14, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0014340877532958984, \"percentile_inc_nulls\": 0.0023272037506103516, \"value_count\": 13, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 39.0, \"distinct_value_count\": 173}, 
{\"percentile_ex_nulls\": 0.001277625560760498, \"percentile_inc_nulls\": 0.0021709203720092773, \"value_count\": 12, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 36.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0011342167854309082, \"percentile_inc_nulls\": 0.0020276308059692383, \"value_count\": 11, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.001003861427307129, \"percentile_inc_nulls\": 0.0018973350524902344, \"value_count\": 10, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 30.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0008864998817443848, \"percentile_inc_nulls\": 0.0017801523208618164, \"value_count\": 9, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 27.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0008170008659362793, \"percentile_inc_nulls\": 0.001710653305053711, \"value_count\": 8, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0005736351013183594, \"percentile_inc_nulls\": 0.0014675259590148926, \"value_count\": 7, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0003911256790161133, \"percentile_inc_nulls\": 0.0012851953506469727, \"value_count\": 6, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 42.0, 
\"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.00028246641159057617, \"percentile_inc_nulls\": 0.001176595687866211, \"value_count\": 5, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 25.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.00019556283950805664, \"percentile_inc_nulls\": 0.0010898113250732422, \"value_count\": 4, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 20.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 9.125471115112305e-05, \"percentile_inc_nulls\": 0.000985562801361084, \"value_count\": 3, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 24.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 3.910064697265625e-05, \"percentile_inc_nulls\": 0.0009334683418273926, \"value_count\": 2, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 12.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0008944272994995117, \"value_count\": 1, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 9.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 35075, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 35075.0, \"distinct_value_count\": 173}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": 
\"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"state\\\"\", \"subtitle\": \"In this col, 206 values (0.1%) are null and there are 173 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 35075, \"group_name\": \"_state_\", \"value\": \"ca\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 173}, {\"value_count\": 27970, \"group_name\": \"_state_\", \"value\": \"ny\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 173}, {\"value_count\": 19386, \"group_name\": \"_state_\", \"value\": \"tx\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 173}, {\"value_count\": 10429, \"group_name\": \"_state_\", \"value\": \"ma\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 173}, {\"value_count\": 10221, \"group_name\": \"_state_\", \"value\": \"fl\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 173}, {\"value_count\": 9126, \"group_name\": \"_state_\", \"value\": \"il\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 173}, {\"value_count\": 8554, \"group_name\": \"_state_\", \"value\": \"pa\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 173}, {\"value_count\": 8495, \"group_name\": \"_state_\", \"value\": \"nj\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 173}, {\"value_count\": 5995, \"group_name\": \"_state_\", \"value\": \"oh\", 
\"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 173}, {\"value_count\": 5568, \"group_name\": \"_state_\", \"value\": \"va\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 173}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"y0\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 173}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"p2\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 173}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"r4\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 173}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"h9\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 173}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"a7\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 173}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": 
\"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 35075]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9145660996437073, \"percentile_inc_nulls\": 0.9146057367324829, \"value_count\": 19668, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 19668.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.8862184286117554, \"percentile_inc_nulls\": 0.8862712979316711, \"value_count\": 6526, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 6526.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.8684000968933105, \"percentile_inc_nulls\": 0.8684612512588501, \"value_count\": 4102, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 4102.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.8539091944694519, \"percentile_inc_nulls\": 0.8539770841598511, \"value_count\": 3336, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3336.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.8407561779022217, \"percentile_inc_nulls\": 0.8408301472663879, \"value_count\": 3028, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3028.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.8295056819915771, \"percentile_inc_nulls\": 0.8295849561691284, \"value_count\": 2590, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2590.0, 
\"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.8183161020278931, \"percentile_inc_nulls\": 0.818400502204895, \"value_count\": 2576, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2576.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.8077041506767273, \"percentile_inc_nulls\": 0.8077934980392456, \"value_count\": 2443, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2443.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.7973007559776306, \"percentile_inc_nulls\": 0.7973949313163757, \"value_count\": 2395, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2395.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.7871319055557251, \"percentile_inc_nulls\": 0.7872307896614075, \"value_count\": 2341, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2341.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.7781966924667358, \"percentile_inc_nulls\": 0.7782997488975525, \"value_count\": 2057, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2057.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.7695308327674866, \"percentile_inc_nulls\": 0.7696378827095032, \"value_count\": 1995, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1995.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.761177659034729, \"percentile_inc_nulls\": 0.7612886428833008, \"value_count\": 1923, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, 
\"sum_tokens_in_value_count_group\": 1923.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.7531590461730957, \"percentile_inc_nulls\": 0.7532737255096436, \"value_count\": 1846, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1846.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.7454574704170227, \"percentile_inc_nulls\": 0.7455757260322571, \"value_count\": 1773, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1773.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.7377602458000183, \"percentile_inc_nulls\": 0.7378820776939392, \"value_count\": 1772, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1772.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.7311316132545471, \"percentile_inc_nulls\": 0.7312564849853516, \"value_count\": 1526, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1526.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.72495037317276, \"percentile_inc_nulls\": 0.7250781655311584, \"value_count\": 1423, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1423.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.7190427780151367, \"percentile_inc_nulls\": 0.7191733121871948, \"value_count\": 1360, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1360.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.713752031326294, \"percentile_inc_nulls\": 0.7138850688934326, \"value_count\": 1218, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, 
\"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1218.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.7086306810379028, \"percentile_inc_nulls\": 0.7087661027908325, \"value_count\": 1179, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1179.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.7035441398620605, \"percentile_inc_nulls\": 0.7036818265914917, \"value_count\": 1171, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1171.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6986442804336548, \"percentile_inc_nulls\": 0.6987842917442322, \"value_count\": 1128, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1128.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6939703226089478, \"percentile_inc_nulls\": 0.6941125392913818, \"value_count\": 1076, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1076.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6893181800842285, \"percentile_inc_nulls\": 0.6894624829292297, \"value_count\": 1071, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1071.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6846963167190552, \"percentile_inc_nulls\": 0.6848428249359131, \"value_count\": 1064, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1064.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6800962686538696, \"percentile_inc_nulls\": 0.6802448630332947, \"value_count\": 1059, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1059.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.675557017326355, \"percentile_inc_nulls\": 0.6757076978683472, \"value_count\": 1045, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1045.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6712740063667297, \"percentile_inc_nulls\": 0.6714267134666443, \"value_count\": 986, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 986.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6669953465461731, \"percentile_inc_nulls\": 0.6671500205993652, \"value_count\": 985, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 985.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6627557873725891, \"percentile_inc_nulls\": 0.6629124879837036, \"value_count\": 976, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 976.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.658542275428772, \"percentile_inc_nulls\": 0.6587009429931641, \"value_count\": 970, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 970.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6544721722602844, \"percentile_inc_nulls\": 0.6546326875686646, \"value_count\": 937, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 937.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6505627632141113, \"percentile_inc_nulls\": 0.6507250666618347, \"value_count\": 900, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 900.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6466923952102661, \"percentile_inc_nulls\": 0.6468565464019775, \"value_count\": 891, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 891.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6429871320724487, \"percentile_inc_nulls\": 0.6431530117988586, \"value_count\": 853, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 853.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6356287002563477, \"percentile_inc_nulls\": 0.6357979774475098, \"value_count\": 847, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1694.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.632053792476654, \"percentile_inc_nulls\": 0.6322247385978699, \"value_count\": 823, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 823.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6285179257392883, \"percentile_inc_nulls\": 0.6286904811859131, \"value_count\": 814, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 814.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.624999463558197, \"percentile_inc_nulls\": 0.6251736879348755, \"value_count\": 810, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 810.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6215765476226807, \"percentile_inc_nulls\": 0.6217523813247681, \"value_count\": 788, 
\"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 788.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6181753277778625, \"percentile_inc_nulls\": 0.6183527708053589, \"value_count\": 783, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 783.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6148914098739624, \"percentile_inc_nulls\": 0.6150703430175781, \"value_count\": 756, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 756.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6116161942481995, \"percentile_inc_nulls\": 0.6117966175079346, \"value_count\": 754, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 754.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.608384370803833, \"percentile_inc_nulls\": 0.6085663437843323, \"value_count\": 744, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 744.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6051656603813171, \"percentile_inc_nulls\": 0.605349063873291, \"value_count\": 741, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 741.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6019772887229919, \"percentile_inc_nulls\": 0.60216224193573, \"value_count\": 734, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 734.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5987932682037354, \"percentile_inc_nulls\": 0.5989797115325928, \"value_count\": 733, 
\"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 733.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5956918001174927, \"percentile_inc_nulls\": 0.5958796739578247, \"value_count\": 714, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 714.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5925990343093872, \"percentile_inc_nulls\": 0.5927883386611938, \"value_count\": 712, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 712.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.58952796459198, \"percentile_inc_nulls\": 0.5897186398506165, \"value_count\": 707, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 707.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5864698886871338, \"percentile_inc_nulls\": 0.5866620540618896, \"value_count\": 704, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 704.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5835595726966858, \"percentile_inc_nulls\": 0.5837530493736267, \"value_count\": 670, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 670.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5777822732925415, \"percentile_inc_nulls\": 0.5779784917831421, \"value_count\": 665, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1330.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5748980045318604, \"percentile_inc_nulls\": 0.575095534324646, \"value_count\": 664, 
\"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 664.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5720224380493164, \"percentile_inc_nulls\": 0.5722212791442871, \"value_count\": 662, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 662.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5663493871688843, \"percentile_inc_nulls\": 0.5665508508682251, \"value_count\": 653, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1306.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5635216236114502, \"percentile_inc_nulls\": 0.5637243986129761, \"value_count\": 651, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 651.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5607241988182068, \"percentile_inc_nulls\": 0.5609282851219177, \"value_count\": 644, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 644.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.557952880859375, \"percentile_inc_nulls\": 0.5581582188606262, \"value_count\": 638, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 638.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.555185854434967, \"percentile_inc_nulls\": 0.5553925037384033, \"value_count\": 637, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 637.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5524709820747375, \"percentile_inc_nulls\": 0.5526788830757141, \"value_count\": 
625, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 625.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5470933318138123, \"percentile_inc_nulls\": 0.5473037958145142, \"value_count\": 619, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1238.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5444697141647339, \"percentile_inc_nulls\": 0.5446813106536865, \"value_count\": 604, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 604.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5418981313705444, \"percentile_inc_nulls\": 0.5421109795570374, \"value_count\": 592, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 592.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.539383053779602, \"percentile_inc_nulls\": 0.5395970940589905, \"value_count\": 579, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 579.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.536933183670044, \"percentile_inc_nulls\": 0.5371483564376831, \"value_count\": 564, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 564.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5345006585121155, \"percentile_inc_nulls\": 0.5347169041633606, \"value_count\": 560, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 560.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5296703577041626, \"percentile_inc_nulls\": 0.5298888683319092, 
\"value_count\": 556, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1112.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5273073315620422, \"percentile_inc_nulls\": 0.5275269150733948, \"value_count\": 544, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 544.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.525026798248291, \"percentile_inc_nulls\": 0.5252474546432495, \"value_count\": 525, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 525.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5227984189987183, \"percentile_inc_nulls\": 0.5230201482772827, \"value_count\": 513, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 513.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5206004977226257, \"percentile_inc_nulls\": 0.5208232402801514, \"value_count\": 506, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 506.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5185198187828064, \"percentile_inc_nulls\": 0.5187435150146484, \"value_count\": 479, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 479.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5144105553627014, \"percentile_inc_nulls\": 0.5146361589431763, \"value_count\": 473, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 946.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5123603343963623, \"percentile_inc_nulls\": 
0.5125868320465088, \"value_count\": 472, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 472.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5083205699920654, \"percentile_inc_nulls\": 0.5085489749908447, \"value_count\": 465, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 930.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5063441395759583, \"percentile_inc_nulls\": 0.5065734386444092, \"value_count\": 455, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 455.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5043807029724121, \"percentile_inc_nulls\": 0.5046110153198242, \"value_count\": 452, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 452.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5024434328079224, \"percentile_inc_nulls\": 0.5026745796203613, \"value_count\": 446, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 446.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5005277395248413, \"percentile_inc_nulls\": 0.5007598400115967, \"value_count\": 441, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 441.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4986165165901184, \"percentile_inc_nulls\": 0.49884945154190063, \"value_count\": 440, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 440.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.49672257900238037, 
\"percentile_inc_nulls\": 0.49695640802383423, \"value_count\": 436, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 436.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4948808550834656, \"percentile_inc_nulls\": 0.4951155185699463, \"value_count\": 424, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 424.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.49305206537246704, \"percentile_inc_nulls\": 0.4932876229286194, \"value_count\": 421, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 421.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4912320375442505, \"percentile_inc_nulls\": 0.4914683699607849, \"value_count\": 419, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 419.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4894424080848694, \"percentile_inc_nulls\": 0.48967957496643066, \"value_count\": 412, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 412.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4876744747161865, \"percentile_inc_nulls\": 0.48791247606277466, \"value_count\": 407, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 407.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.48598039150238037, \"percentile_inc_nulls\": 0.4862191677093506, \"value_count\": 390, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 390.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 
0.48432105779647827, \"percentile_inc_nulls\": 0.48456060886383057, \"value_count\": 382, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 382.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4826660752296448, \"percentile_inc_nulls\": 0.48290640115737915, \"value_count\": 381, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 381.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4810153841972351, \"percentile_inc_nulls\": 0.48125648498535156, \"value_count\": 380, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 380.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4793734550476074, \"percentile_inc_nulls\": 0.47961533069610596, \"value_count\": 378, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 378.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4777619242668152, \"percentile_inc_nulls\": 0.478004515171051, \"value_count\": 371, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 371.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4761546850204468, \"percentile_inc_nulls\": 0.4763980507850647, \"value_count\": 370, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 370.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4729924201965332, \"percentile_inc_nulls\": 0.47323721647262573, \"value_count\": 364, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 728.0, \"distinct_value_count\": 5233}, 
{\"percentile_ex_nulls\": 0.47143298387527466, \"percentile_inc_nulls\": 0.47167855501174927, \"value_count\": 359, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 359.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.46989959478378296, \"percentile_inc_nulls\": 0.4701458811759949, \"value_count\": 353, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 353.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4683836102485657, \"percentile_inc_nulls\": 0.4686306118965149, \"value_count\": 349, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 349.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4668763279914856, \"percentile_inc_nulls\": 0.46712398529052734, \"value_count\": 347, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 347.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4653733968734741, \"percentile_inc_nulls\": 0.4656217694282532, \"value_count\": 346, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 346.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4638834595680237, \"percentile_inc_nulls\": 0.46413248777389526, \"value_count\": 343, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 343.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.46240222454071045, \"percentile_inc_nulls\": 0.46265196800231934, \"value_count\": 341, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 341.0, 
\"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4609253406524658, \"percentile_inc_nulls\": 0.46117573976516724, \"value_count\": 340, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 340.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.459452748298645, \"percentile_inc_nulls\": 0.45970386266708374, \"value_count\": 339, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 339.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4579976201057434, \"percentile_inc_nulls\": 0.45824939012527466, \"value_count\": 335, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 335.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.45655542612075806, \"percentile_inc_nulls\": 0.4568079113960266, \"value_count\": 332, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 332.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4551176428794861, \"percentile_inc_nulls\": 0.45537078380584717, \"value_count\": 331, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 331.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.45371025800704956, \"percentile_inc_nulls\": 0.4539640545845032, \"value_count\": 324, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 324.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.45231592655181885, \"percentile_inc_nulls\": 0.4525703191757202, \"value_count\": 321, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, 
\"sum_tokens_in_value_count_group\": 321.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4509432315826416, \"percentile_inc_nulls\": 0.4511983394622803, \"value_count\": 316, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 316.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4496009945869446, \"percentile_inc_nulls\": 0.449856698513031, \"value_count\": 309, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 309.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.44700342416763306, \"percentile_inc_nulls\": 0.447260320186615, \"value_count\": 299, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 598.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4457089900970459, \"percentile_inc_nulls\": 0.4459664821624756, \"value_count\": 298, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 298.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.44442319869995117, \"percentile_inc_nulls\": 0.4446812868118286, \"value_count\": 296, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 296.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.44314175844192505, \"percentile_inc_nulls\": 0.443400502204895, \"value_count\": 295, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 295.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4405876398086548, \"percentile_inc_nulls\": 0.4408475160598755, \"value_count\": 294, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 
230320, \"sum_tokens_in_value_count_group\": 588.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.43931490182876587, \"percentile_inc_nulls\": 0.4395753741264343, \"value_count\": 293, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 293.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.43805521726608276, \"percentile_inc_nulls\": 0.4383162260055542, \"value_count\": 290, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 290.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.435544490814209, \"percentile_inc_nulls\": 0.4358066916465759, \"value_count\": 289, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 578.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4342978000640869, \"percentile_inc_nulls\": 0.4345605969429016, \"value_count\": 287, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 287.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.43305546045303345, \"percentile_inc_nulls\": 0.4333188533782959, \"value_count\": 286, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 286.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4318174719810486, \"percentile_inc_nulls\": 0.4320814609527588, \"value_count\": 285, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 285.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.43059688806533813, \"percentile_inc_nulls\": 0.4308614134788513, \"value_count\": 281, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, 
\"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 281.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4293805956840515, \"percentile_inc_nulls\": 0.42964571714401245, \"value_count\": 280, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 280.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4270002245903015, \"percentile_inc_nulls\": 0.4272664189338684, \"value_count\": 274, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 548.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.42581868171691895, \"percentile_inc_nulls\": 0.4260854721069336, \"value_count\": 272, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 272.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.42464590072631836, \"percentile_inc_nulls\": 0.4249131679534912, \"value_count\": 270, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 270.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.42349910736083984, \"percentile_inc_nulls\": 0.4237669110298157, \"value_count\": 264, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 264.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4212229251861572, \"percentile_inc_nulls\": 0.4214918613433838, \"value_count\": 262, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 524.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4189555048942566, \"percentile_inc_nulls\": 0.41922545433044434, \"value_count\": 261, \"group_name\": \"_city_\", \"total_non_null_rows\": 
230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 522.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4178304672241211, \"percentile_inc_nulls\": 0.41810089349746704, \"value_count\": 259, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 259.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.41558903455734253, \"percentile_inc_nulls\": 0.41586053371429443, \"value_count\": 258, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 516.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4133650064468384, \"percentile_inc_nulls\": 0.41363751888275146, \"value_count\": 256, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 512.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.41115838289260864, \"percentile_inc_nulls\": 0.4114319086074829, \"value_count\": 254, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 508.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.410072386264801, \"percentile_inc_nulls\": 0.41034644842147827, \"value_count\": 250, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 250.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4057633876800537, \"percentile_inc_nulls\": 0.40603941679000854, \"value_count\": 248, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 992.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4046904444694519, \"percentile_inc_nulls\": 0.4049670100212097, \"value_count\": 247, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 247.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4036218523979187, \"percentile_inc_nulls\": 0.4038988947868347, \"value_count\": 246, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 246.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4025663137435913, \"percentile_inc_nulls\": 0.4028438925743103, \"value_count\": 243, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 243.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.40152817964553833, \"percentile_inc_nulls\": 0.40180617570877075, \"value_count\": 239, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 239.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4004986882209778, \"percentile_inc_nulls\": 0.4007771611213684, \"value_count\": 237, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 237.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3984571099281311, \"percentile_inc_nulls\": 0.39873653650283813, \"value_count\": 235, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 470.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3974406123161316, \"percentile_inc_nulls\": 0.3977205753326416, \"value_count\": 234, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.39642852544784546, \"percentile_inc_nulls\": 0.3967089056968689, \"value_count\": 233, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 233.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.39343130588531494, \"percentile_inc_nulls\": 0.393713116645813, \"value_count\": 230, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 690.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3924669623374939, \"percentile_inc_nulls\": 0.39274919033050537, \"value_count\": 222, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 222.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.39057308435440063, \"percentile_inc_nulls\": 0.39085620641708374, \"value_count\": 218, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 436.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3896304965019226, \"percentile_inc_nulls\": 0.38991403579711914, \"value_count\": 217, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 217.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.388696551322937, \"percentile_inc_nulls\": 0.38898056745529175, \"value_count\": 215, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 215.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.38684606552124023, \"percentile_inc_nulls\": 0.3871309757232666, \"value_count\": 213, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 426.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3859252333641052, \"percentile_inc_nulls\": 0.38621050119400024, \"value_count\": 212, 
\"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 212.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.38500869274139404, \"percentile_inc_nulls\": 0.3852943778038025, \"value_count\": 211, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 211.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3841007947921753, \"percentile_inc_nulls\": 0.38438695669174194, \"value_count\": 209, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 209.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3831973075866699, \"percentile_inc_nulls\": 0.3834838271141052, \"value_count\": 208, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 208.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.382302463054657, \"percentile_inc_nulls\": 0.3825894594192505, \"value_count\": 206, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 206.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.38141196966171265, \"percentile_inc_nulls\": 0.3816993832588196, \"value_count\": 205, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 205.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3805258870124817, \"percentile_inc_nulls\": 0.3808136582374573, \"value_count\": 204, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 204.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3796483874320984, \"percentile_inc_nulls\": 0.3799366354942322, \"value_count\": 
202, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 202.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.37877529859542847, \"percentile_inc_nulls\": 0.3790639042854309, \"value_count\": 201, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 201.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.37790656089782715, \"percentile_inc_nulls\": 0.37819552421569824, \"value_count\": 200, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.37704646587371826, \"percentile_inc_nulls\": 0.37733590602874756, \"value_count\": 198, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 198.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3761950731277466, \"percentile_inc_nulls\": 0.37648487091064453, \"value_count\": 196, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 196.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.37369304895401, \"percentile_inc_nulls\": 0.37398403882980347, \"value_count\": 192, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 576.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.37286776304244995, \"percentile_inc_nulls\": 0.37315911054611206, \"value_count\": 190, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 190.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3712257742881775, \"percentile_inc_nulls\": 0.3715178966522217, 
\"value_count\": 189, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 378.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3687758445739746, \"percentile_inc_nulls\": 0.36906909942626953, \"value_count\": 188, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 564.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3679679036140442, \"percentile_inc_nulls\": 0.36826157569885254, \"value_count\": 186, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 186.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3663780689239502, \"percentile_inc_nulls\": 0.36667245626449585, \"value_count\": 183, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 366.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3655875325202942, \"percentile_inc_nulls\": 0.3658822774887085, \"value_count\": 182, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 182.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.364801287651062, \"percentile_inc_nulls\": 0.36509639024734497, \"value_count\": 181, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 181.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.36401939392089844, \"percentile_inc_nulls\": 0.36431485414505005, \"value_count\": 180, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 180.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.36324185132980347, \"percentile_inc_nulls\": 
0.36353766918182373, \"value_count\": 179, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 179.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.36170417070388794, \"percentile_inc_nulls\": 0.3620007038116455, \"value_count\": 177, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 354.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3594106435775757, \"percentile_inc_nulls\": 0.3597082495689392, \"value_count\": 176, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 528.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3578903079032898, \"percentile_inc_nulls\": 0.3581886291503906, \"value_count\": 175, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 350.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3571431636810303, \"percentile_inc_nulls\": 0.35744184255599976, \"value_count\": 172, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 172.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.35565757751464844, \"percentile_inc_nulls\": 0.35595691204071045, \"value_count\": 171, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 342.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3534422516822815, \"percentile_inc_nulls\": 0.3537425994873047, \"value_count\": 170, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 510.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3519740700721741, 
\"percentile_inc_nulls\": 0.3522750735282898, \"value_count\": 169, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 338.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3512442708015442, \"percentile_inc_nulls\": 0.35154569149017334, \"value_count\": 168, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 168.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3505188822746277, \"percentile_inc_nulls\": 0.3508206009864807, \"value_count\": 167, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 167.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.34908539056777954, \"percentile_inc_nulls\": 0.3493878245353699, \"value_count\": 165, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 330.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3476606607437134, \"percentile_inc_nulls\": 0.34796369075775146, \"value_count\": 164, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 328.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3455886244773865, \"percentile_inc_nulls\": 0.34589266777038574, \"value_count\": 159, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 477.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.344215989112854, \"percentile_inc_nulls\": 0.3445206880569458, \"value_count\": 158, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 316.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 
0.3421700596809387, \"percentile_inc_nulls\": 0.3424757122993469, \"value_count\": 157, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 471.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3414924144744873, \"percentile_inc_nulls\": 0.3417983651161194, \"value_count\": 156, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 156.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.34014588594436646, \"percentile_inc_nulls\": 0.3404524326324463, \"value_count\": 155, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 310.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3394855856895447, \"percentile_inc_nulls\": 0.33979249000549316, \"value_count\": 152, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 152.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3388296961784363, \"percentile_inc_nulls\": 0.33913683891296387, \"value_count\": 151, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 151.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3381824493408203, \"percentile_inc_nulls\": 0.33848994970321655, \"value_count\": 149, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 149.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.33689671754837036, \"percentile_inc_nulls\": 0.3372047543525696, \"value_count\": 148, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 296.0, \"distinct_value_count\": 5233}, 
{\"percentile_ex_nulls\": 0.3362581729888916, \"percentile_inc_nulls\": 0.3365665078163147, \"value_count\": 147, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 147.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.33562833070755005, \"percentile_inc_nulls\": 0.335936963558197, \"value_count\": 145, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 145.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3350027799606323, \"percentile_inc_nulls\": 0.3353117108345032, \"value_count\": 144, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.33189696073532104, \"percentile_inc_nulls\": 0.3322073817253113, \"value_count\": 143, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 715.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3312801718711853, \"percentile_inc_nulls\": 0.33159083127975464, \"value_count\": 142, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 142.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.33005523681640625, \"percentile_inc_nulls\": 0.33036643266677856, \"value_count\": 141, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 282.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3282307982444763, \"percentile_inc_nulls\": 0.32854288816452026, \"value_count\": 140, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 420.0, \"distinct_value_count\": 
5233}, {\"percentile_ex_nulls\": 0.3270232081413269, \"percentile_inc_nulls\": 0.3273358941078186, \"value_count\": 139, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 278.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3264237642288208, \"percentile_inc_nulls\": 0.3267366886138916, \"value_count\": 138, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3258286714553833, \"percentile_inc_nulls\": 0.326141893863678, \"value_count\": 137, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 137.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3246471881866455, \"percentile_inc_nulls\": 0.32496094703674316, \"value_count\": 136, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 272.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.32288795709609985, \"percentile_inc_nulls\": 0.3232024908065796, \"value_count\": 135, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 405.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3217238187789917, \"percentile_inc_nulls\": 0.3220388889312744, \"value_count\": 134, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 268.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.32114607095718384, \"percentile_inc_nulls\": 0.32146143913269043, \"value_count\": 133, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 133.0, 
\"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3205726742744446, \"percentile_inc_nulls\": 0.32088834047317505, \"value_count\": 132, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 132.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3188655972480774, \"percentile_inc_nulls\": 0.31918197870254517, \"value_count\": 131, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 393.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.31717151403427124, \"percentile_inc_nulls\": 0.31748872995376587, \"value_count\": 130, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 390.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.31605076789855957, \"percentile_inc_nulls\": 0.3163685202598572, \"value_count\": 129, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 258.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.31550347805023193, \"percentile_inc_nulls\": 0.31582146883010864, \"value_count\": 126, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 126.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3138875961303711, \"percentile_inc_nulls\": 0.3142063021659851, \"value_count\": 124, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 372.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3107079267501831, \"percentile_inc_nulls\": 0.3110281229019165, \"value_count\": 122, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, 
\"sum_tokens_in_value_count_group\": 732.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3101823329925537, \"percentile_inc_nulls\": 0.3105027675628662, \"value_count\": 121, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 121.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.30966103076934814, \"percentile_inc_nulls\": 0.3099817633628845, \"value_count\": 120, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3081102967262268, \"percentile_inc_nulls\": 0.3084317445755005, \"value_count\": 119, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 357.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.30708515644073486, \"percentile_inc_nulls\": 0.30740708112716675, \"value_count\": 118, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 236.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3065769672393799, \"percentile_inc_nulls\": 0.3068990707397461, \"value_count\": 117, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 117.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3050653338432312, \"percentile_inc_nulls\": 0.3053881525993347, \"value_count\": 116, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 348.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.30456578731536865, \"percentile_inc_nulls\": 0.30488884449005127, \"value_count\": 115, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 
230320, \"sum_tokens_in_value_count_group\": 115.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.30309319496154785, \"percentile_inc_nulls\": 0.3034169673919678, \"value_count\": 113, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 339.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.30163371562957764, \"percentile_inc_nulls\": 0.3019581437110901, \"value_count\": 112, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 336.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3011515140533447, \"percentile_inc_nulls\": 0.3014761805534363, \"value_count\": 111, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 111.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.29971808195114136, \"percentile_inc_nulls\": 0.30004340410232544, \"value_count\": 110, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 330.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.298297643661499, \"percentile_inc_nulls\": 0.2986236810684204, \"value_count\": 109, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 327.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.29454463720321655, \"percentile_inc_nulls\": 0.29487234354019165, \"value_count\": 108, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 864.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.29361504316329956, \"percentile_inc_nulls\": 0.29394322633743286, \"value_count\": 107, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, 
\"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 214.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.29177325963974, \"percentile_inc_nulls\": 0.29210227727890015, \"value_count\": 106, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 424.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.28949278593063354, \"percentile_inc_nulls\": 0.28982287645339966, \"value_count\": 105, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 525.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.28904104232788086, \"percentile_inc_nulls\": 0.2893713116645813, \"value_count\": 104, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 104.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2881461977958679, \"percentile_inc_nulls\": 0.2884768843650818, \"value_count\": 103, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 206.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2854877710342407, \"percentile_inc_nulls\": 0.2858197093009949, \"value_count\": 102, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 612.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2841716408729553, \"percentile_inc_nulls\": 0.2845041751861572, \"value_count\": 101, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 303.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.282881498336792, \"percentile_inc_nulls\": 0.28321462869644165, \"value_count\": 99, \"group_name\": \"_city_\", \"total_non_null_rows\": 
230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 297.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2816044092178345, \"percentile_inc_nulls\": 0.28193819522857666, \"value_count\": 98, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 294.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.27907633781433105, \"percentile_inc_nulls\": 0.279411256313324, \"value_count\": 97, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 582.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2769913077354431, \"percentile_inc_nulls\": 0.27732717990875244, \"value_count\": 96, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 480.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2749279737472534, \"percentile_inc_nulls\": 0.2752648591995239, \"value_count\": 95, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 475.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2724781036376953, \"percentile_inc_nulls\": 0.2728160619735718, \"value_count\": 94, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 564.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2712661623954773, \"percentile_inc_nulls\": 0.2716047167778015, \"value_count\": 93, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 279.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2708665728569031, \"percentile_inc_nulls\": 0.2712053060531616, \"value_count\": 92, \"group_name\": \"_city_\", \"total_non_null_rows\": 
230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 92.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2684948444366455, \"percentile_inc_nulls\": 0.26883465051651, \"value_count\": 91, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 546.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.26732200384140015, \"percentile_inc_nulls\": 0.2676624059677124, \"value_count\": 90, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 270.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2661622166633606, \"percentile_inc_nulls\": 0.26650315523147583, \"value_count\": 89, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 267.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2646331787109375, \"percentile_inc_nulls\": 0.26497483253479004, \"value_count\": 88, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 352.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.26274365186691284, \"percentile_inc_nulls\": 0.26308614015579224, \"value_count\": 87, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 435.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.26162290573120117, \"percentile_inc_nulls\": 0.2619659900665283, \"value_count\": 86, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 258.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.25903838872909546, \"percentile_inc_nulls\": 0.25938260555267334, \"value_count\": 85, \"group_name\": \"_city_\", \"total_non_null_rows\": 
230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 595.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.25684911012649536, \"percentile_inc_nulls\": 0.2571943402290344, \"value_count\": 84, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 504.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2554069757461548, \"percentile_inc_nulls\": 0.2557528614997864, \"value_count\": 83, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 332.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2532697916030884, \"percentile_inc_nulls\": 0.25361669063568115, \"value_count\": 82, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 492.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.252214252948761, \"percentile_inc_nulls\": 0.25256162881851196, \"value_count\": 81, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 243.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.25151926279067993, \"percentile_inc_nulls\": 0.25186699628829956, \"value_count\": 80, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.24946027994155884, \"percentile_inc_nulls\": 0.24980896711349487, \"value_count\": 79, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 474.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.24607211351394653, \"percentile_inc_nulls\": 0.24642235040664673, \"value_count\": 78, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 780.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2440652847290039, \"percentile_inc_nulls\": 0.2444164752960205, \"value_count\": 77, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 462.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.24109411239624023, \"percentile_inc_nulls\": 0.24144667387008667, \"value_count\": 76, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 684.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2388136386871338, \"percentile_inc_nulls\": 0.23916727304458618, \"value_count\": 75, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 525.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.23784929513931274, \"percentile_inc_nulls\": 0.23820334672927856, \"value_count\": 74, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 222.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2368980050086975, \"percentile_inc_nulls\": 0.23725253343582153, \"value_count\": 73, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 219.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2347087264060974, \"percentile_inc_nulls\": 0.23506426811218262, \"value_count\": 72, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 504.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.23285824060440063, \"percentile_inc_nulls\": 0.23321467638015747, \"value_count\": 71, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 426.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.23133796453475952, \"percentile_inc_nulls\": 0.2316950559616089, \"value_count\": 70, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 350.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.23013901710510254, \"percentile_inc_nulls\": 0.23049670457839966, \"value_count\": 69, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 276.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.22777599096298218, \"percentile_inc_nulls\": 0.22813475131988525, \"value_count\": 68, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 544.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.22661185264587402, \"percentile_inc_nulls\": 0.22697114944458008, \"value_count\": 67, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 268.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2246050238609314, \"percentile_inc_nulls\": 0.22496527433395386, \"value_count\": 66, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 462.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2212168574333191, \"percentile_inc_nulls\": 0.2215786576271057, \"value_count\": 65, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 780.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.21760284900665283, \"percentile_inc_nulls\": 0.21796631813049316, \"value_count\": 64, 
\"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 832.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.21486622095108032, \"percentile_inc_nulls\": 0.2152310013771057, \"value_count\": 63, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 630.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.21217310428619385, \"percentile_inc_nulls\": 0.21253907680511475, \"value_count\": 62, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 620.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.20925837755203247, \"percentile_inc_nulls\": 0.2096257209777832, \"value_count\": 61, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 671.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2082158923149109, \"percentile_inc_nulls\": 0.20858371257781982, \"value_count\": 60, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 240.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.206165611743927, \"percentile_inc_nulls\": 0.20653438568115234, \"value_count\": 59, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 472.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.20415008068084717, \"percentile_inc_nulls\": 0.20451980829238892, \"value_count\": 58, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 464.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2011789083480835, \"percentile_inc_nulls\": 0.20155000686645508, \"value_count\": 
57, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 684.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.19874638319015503, \"percentile_inc_nulls\": 0.19911861419677734, \"value_count\": 56, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 560.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.19611835479736328, \"percentile_inc_nulls\": 0.19649183750152588, \"value_count\": 55, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 605.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.1944764256477356, \"percentile_inc_nulls\": 0.1948506236076355, \"value_count\": 54, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 378.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.19171375036239624, \"percentile_inc_nulls\": 0.1920892596244812, \"value_count\": 53, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 636.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.19035851955413818, \"percentile_inc_nulls\": 0.1907346248626709, \"value_count\": 52, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 312.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.18858623504638672, \"percentile_inc_nulls\": 0.1889631748199463, \"value_count\": 51, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 408.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.185979962348938, \"percentile_inc_nulls\": 0.18635809421539307, 
\"value_count\": 50, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 600.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.1830000877380371, \"percentile_inc_nulls\": 0.1833796501159668, \"value_count\": 49, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 686.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.18028956651687622, \"percentile_inc_nulls\": 0.1806703805923462, \"value_count\": 48, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 624.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.17763549089431763, \"percentile_inc_nulls\": 0.17801755666732788, \"value_count\": 47, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 611.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.1750379204750061, \"percentile_inc_nulls\": 0.17542117834091187, \"value_count\": 46, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 598.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.17151939868927002, \"percentile_inc_nulls\": 0.1719043254852295, \"value_count\": 45, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 810.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.16960811614990234, \"percentile_inc_nulls\": 0.16999393701553345, \"value_count\": 44, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 440.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.167927086353302, \"percentile_inc_nulls\": 
0.1683136224746704, \"value_count\": 43, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 387.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.16555535793304443, \"percentile_inc_nulls\": 0.16594302654266357, \"value_count\": 42, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 546.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.16324013471603394, \"percentile_inc_nulls\": 0.16362887620925903, \"value_count\": 41, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 533.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.16046011447906494, \"percentile_inc_nulls\": 0.16085010766983032, \"value_count\": 40, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 640.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.15808838605880737, \"percentile_inc_nulls\": 0.1584795117378235, \"value_count\": 39, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 546.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.1557774543762207, \"percentile_inc_nulls\": 0.15616965293884277, \"value_count\": 38, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 532.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.15256303548812866, \"percentile_inc_nulls\": 0.1529567837715149, \"value_count\": 37, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 740.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.15131205320358276, 
\"percentile_inc_nulls\": 0.15170633792877197, \"value_count\": 36, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 288.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.148271381855011, \"percentile_inc_nulls\": 0.1486670970916748, \"value_count\": 35, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 700.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.14546531438827515, \"percentile_inc_nulls\": 0.14586228132247925, \"value_count\": 34, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 646.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.14202499389648438, \"percentile_inc_nulls\": 0.1424235701560974, \"value_count\": 33, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 792.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.1379939317703247, \"percentile_inc_nulls\": 0.13839441537857056, \"value_count\": 32, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 928.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.13462752103805542, \"percentile_inc_nulls\": 0.13502949476242065, \"value_count\": 31, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 775.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.12980586290359497, \"percentile_inc_nulls\": 0.13021016120910645, \"value_count\": 30, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1110.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 
0.1256488561630249, \"percentile_inc_nulls\": 0.1260550618171692, \"value_count\": 29, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 957.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.1206621527671814, \"percentile_inc_nulls\": 0.12107068300247192, \"value_count\": 28, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1148.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.11585360765457153, \"percentile_inc_nulls\": 0.11626434326171875, \"value_count\": 27, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1107.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.11280423402786255, \"percentile_inc_nulls\": 0.11321640014648438, \"value_count\": 26, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 702.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.10932916402816772, \"percentile_inc_nulls\": 0.10974293947219849, \"value_count\": 25, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 800.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.10568040609359741, \"percentile_inc_nulls\": 0.10609585046768188, \"value_count\": 24, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 840.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.09978586435317993, \"percentile_inc_nulls\": 0.10020405054092407, \"value_count\": 23, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1357.0, \"distinct_value_count\": 5233}, 
{\"percentile_ex_nulls\": 0.09586775302886963, \"percentile_inc_nulls\": 0.0962877869606018, \"value_count\": 22, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 902.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.09130674600601196, \"percentile_inc_nulls\": 0.09172892570495605, \"value_count\": 21, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1050.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.08800548315048218, \"percentile_inc_nulls\": 0.08842915296554565, \"value_count\": 20, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 760.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.08363121747970581, \"percentile_inc_nulls\": 0.08405697345733643, \"value_count\": 19, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1007.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.07854896783828735, \"percentile_inc_nulls\": 0.07897704839706421, \"value_count\": 18, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1170.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.07478290796279907, \"percentile_inc_nulls\": 0.07521277666091919, \"value_count\": 17, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 867.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.07096034288406372, \"percentile_inc_nulls\": 0.0713919997215271, \"value_count\": 16, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 880.0, 
\"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.06633424758911133, \"percentile_inc_nulls\": 0.06676799058914185, \"value_count\": 15, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1065.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.06256377696990967, \"percentile_inc_nulls\": 0.06299930810928345, \"value_count\": 14, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 868.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.05793333053588867, \"percentile_inc_nulls\": 0.05837094783782959, \"value_count\": 13, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1066.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.05444091558456421, \"percentile_inc_nulls\": 0.05488014221191406, \"value_count\": 12, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 804.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.050666093826293945, \"percentile_inc_nulls\": 0.051107168197631836, \"value_count\": 11, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 869.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.045974791049957275, \"percentile_inc_nulls\": 0.04641801118850708, \"value_count\": 10, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1080.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.04089254140853882, \"percentile_inc_nulls\": 0.04133814573287964, \"value_count\": 9, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, 
\"sum_tokens_in_value_count_group\": 1170.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.03585374355316162, \"percentile_inc_nulls\": 0.03630167245864868, \"value_count\": 8, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1160.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.0304718017578125, \"percentile_inc_nulls\": 0.03092217445373535, \"value_count\": 7, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1239.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.025806546211242676, \"percentile_inc_nulls\": 0.026259124279022217, \"value_count\": 6, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1074.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.0207025408744812, \"percentile_inc_nulls\": 0.021157503128051758, \"value_count\": 5, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1175.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.014812350273132324, \"percentile_inc_nulls\": 0.015270054340362549, \"value_count\": 4, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1356.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.009208858013153076, \"percentile_inc_nulls\": 0.009669184684753418, \"value_count\": 3, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1290.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.003831207752227783, \"percentile_inc_nulls\": 0.004294037818908691, \"value_count\": 2, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, 
\"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1238.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.00046455860137939453, \"value_count\": 1, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 882.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 19668, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 19668.0, \"distinct_value_count\": 5233}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"city\\\"\", \"subtitle\": \"In this col, 107 values (0.0%) are null and there are 5233 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 19668, \"group_name\": \"_city_\", \"value\": \"new york\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 5233}, {\"value_count\": 6526, \"group_name\": \"_city_\", \"value\": \"houston\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 5233}, {\"value_count\": 4102, \"group_name\": \"_city_\", \"value\": \"dallas\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 5233}, {\"value_count\": 
3336, \"group_name\": \"_city_\", \"value\": \"chicago\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 5233}, {\"value_count\": 3028, \"group_name\": \"_city_\", \"value\": \"boston\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 5233}, {\"value_count\": 2590, \"group_name\": \"_city_\", \"value\": \"san diego\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 5233}, {\"value_count\": 2576, \"group_name\": \"_city_\", \"value\": \"charlotte\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 5233}, {\"value_count\": 2443, \"group_name\": \"_city_\", \"value\": \"atlanta\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 5233}, {\"value_count\": 2395, \"group_name\": \"_city_\", \"value\": \"las vegas\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 5233}, {\"value_count\": 2341, \"group_name\": \"_city_\", \"value\": \"san francisco\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 5233}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"del. 
miguel hidalgo\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 5233}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"restonn\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 5233}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"france\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 5233}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"alachva\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 5233}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"la plata,\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 5233}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 19668]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9960616827011108, \"percentile_inc_nulls\": 0.9960663318634033, \"value_count\": 906, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 906.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9925928711891174, \"percentile_inc_nulls\": 0.9926015734672546, \"value_count\": 798, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, 
\"sum_tokens_in_value_count_group\": 798.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9893587231636047, \"percentile_inc_nulls\": 0.9893712997436523, \"value_count\": 744, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 744.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9866940975189209, \"percentile_inc_nulls\": 0.9867097735404968, \"value_count\": 613, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 613.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.984146773815155, \"percentile_inc_nulls\": 0.9841654896736145, \"value_count\": 586, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 586.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9820907115936279, \"percentile_inc_nulls\": 0.9821118712425232, \"value_count\": 473, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 473.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9800606966018677, \"percentile_inc_nulls\": 0.980084240436554, \"value_count\": 467, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 467.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9781697988510132, \"percentile_inc_nulls\": 0.9781955480575562, \"value_count\": 435, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 435.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9763049483299255, \"percentile_inc_nulls\": 0.9763329029083252, \"value_count\": 429, \"group_name\": 
\"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 429.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9745357632637024, \"percentile_inc_nulls\": 0.9745658040046692, \"value_count\": 407, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 407.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.97307950258255, \"percentile_inc_nulls\": 0.9731113314628601, \"value_count\": 335, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 335.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.971675455570221, \"percentile_inc_nulls\": 0.9717089533805847, \"value_count\": 323, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 323.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9704322814941406, \"percentile_inc_nulls\": 0.9704671502113342, \"value_count\": 286, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 286.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9692150950431824, \"percentile_inc_nulls\": 0.9692514538764954, \"value_count\": 280, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 280.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9680327773094177, \"percentile_inc_nulls\": 0.9680705070495605, \"value_count\": 272, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 272.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9668765068054199, 
\"percentile_inc_nulls\": 0.9669156074523926, \"value_count\": 266, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 266.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9657419323921204, \"percentile_inc_nulls\": 0.9657824039459229, \"value_count\": 261, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 261.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9646291136741638, \"percentile_inc_nulls\": 0.9646708965301514, \"value_count\": 256, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 256.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9635206460952759, \"percentile_inc_nulls\": 0.9635637402534485, \"value_count\": 255, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 255.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9624295830726624, \"percentile_inc_nulls\": 0.9624739289283752, \"value_count\": 251, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 251.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9614384770393372, \"percentile_inc_nulls\": 0.9614840149879456, \"value_count\": 228, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 228.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9604865312576294, \"percentile_inc_nulls\": 0.9605331420898438, \"value_count\": 219, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 219.0, 
\"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9595432281494141, \"percentile_inc_nulls\": 0.9595910310745239, \"value_count\": 217, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 217.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9586129784584045, \"percentile_inc_nulls\": 0.9586618542671204, \"value_count\": 214, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 214.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.957704484462738, \"percentile_inc_nulls\": 0.9577544331550598, \"value_count\": 209, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 209.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9568394422531128, \"percentile_inc_nulls\": 0.9568904042243958, \"value_count\": 199, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 199.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9559961557388306, \"percentile_inc_nulls\": 0.9560481309890747, \"value_count\": 194, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 194.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9551658630371094, \"percentile_inc_nulls\": 0.9552188515663147, \"value_count\": 191, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 191.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9543443322181702, \"percentile_inc_nulls\": 0.9543982148170471, \"value_count\": 189, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, 
\"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 189.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9535270929336548, \"percentile_inc_nulls\": 0.9535819888114929, \"value_count\": 188, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 188.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9527316093444824, \"percentile_inc_nulls\": 0.9527873992919922, \"value_count\": 183, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 183.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9511493444442749, \"percentile_inc_nulls\": 0.9512070417404175, \"value_count\": 182, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 364.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9504103660583496, \"percentile_inc_nulls\": 0.950468897819519, \"value_count\": 170, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 170.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9489932656288147, \"percentile_inc_nulls\": 0.9490534663200378, \"value_count\": 163, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 326.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9482933878898621, \"percentile_inc_nulls\": 0.9483544826507568, \"value_count\": 161, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 161.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9469023942947388, \"percentile_inc_nulls\": 0.9469650983810425, 
\"value_count\": 160, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 320.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9462199211120605, \"percentile_inc_nulls\": 0.9462834596633911, \"value_count\": 157, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 157.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9455461502075195, \"percentile_inc_nulls\": 0.9456104636192322, \"value_count\": 155, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 155.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9449158310890198, \"percentile_inc_nulls\": 0.9449809193611145, \"value_count\": 145, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 145.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9443159699440002, \"percentile_inc_nulls\": 0.9443817138671875, \"value_count\": 138, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9437291622161865, \"percentile_inc_nulls\": 0.9437955617904663, \"value_count\": 135, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 135.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9431466460227966, \"percentile_inc_nulls\": 0.9432138204574585, \"value_count\": 134, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 134.0, \"distinct_value_count\": 40089}, 
{\"percentile_ex_nulls\": 0.9425771832466125, \"percentile_inc_nulls\": 0.9426450133323669, \"value_count\": 131, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 131.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9420294761657715, \"percentile_inc_nulls\": 0.9420979619026184, \"value_count\": 126, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 126.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.941486120223999, \"percentile_inc_nulls\": 0.9415552020072937, \"value_count\": 125, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 125.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9409471154212952, \"percentile_inc_nulls\": 0.9410168528556824, \"value_count\": 124, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 124.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9404211044311523, \"percentile_inc_nulls\": 0.9404914975166321, \"value_count\": 121, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 121.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9399082064628601, \"percentile_inc_nulls\": 0.9399791359901428, \"value_count\": 118, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 118.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9389083981513977, \"percentile_inc_nulls\": 0.9389805793762207, \"value_count\": 115, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, 
\"sum_tokens_in_value_count_group\": 230.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9384128451347351, \"percentile_inc_nulls\": 0.9384855628013611, \"value_count\": 114, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9379216432571411, \"percentile_inc_nulls\": 0.9379949569702148, \"value_count\": 113, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 113.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9374391436576843, \"percentile_inc_nulls\": 0.937512993812561, \"value_count\": 111, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 111.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9369609951972961, \"percentile_inc_nulls\": 0.9370354413986206, \"value_count\": 110, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9364871978759766, \"percentile_inc_nulls\": 0.936562180519104, \"value_count\": 109, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 109.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9360220432281494, \"percentile_inc_nulls\": 0.9360976219177246, \"value_count\": 107, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 107.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9355655908584595, \"percentile_inc_nulls\": 0.9356417059898376, \"value_count\": 105, \"group_name\": 
\"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 105.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9346614480018616, \"percentile_inc_nulls\": 0.9347386360168457, \"value_count\": 104, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 208.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9342137575149536, \"percentile_inc_nulls\": 0.934291422367096, \"value_count\": 103, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 103.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9337703585624695, \"percentile_inc_nulls\": 0.9338485598564148, \"value_count\": 102, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.933331310749054, \"percentile_inc_nulls\": 0.9334100484848022, \"value_count\": 101, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 101.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9329009652137756, \"percentile_inc_nulls\": 0.9329801797866821, \"value_count\": 99, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9324749708175659, \"percentile_inc_nulls\": 0.9325547218322754, \"value_count\": 98, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9320533275604248, 
\"percentile_inc_nulls\": 0.9321335554122925, \"value_count\": 97, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 97.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.931218683719635, \"percentile_inc_nulls\": 0.9312999248504639, \"value_count\": 96, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 192.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9303928017616272, \"percentile_inc_nulls\": 0.9304749965667725, \"value_count\": 95, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 190.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9295755624771118, \"percentile_inc_nulls\": 0.9296587109565735, \"value_count\": 94, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 188.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9287670254707336, \"percentile_inc_nulls\": 0.9288511872291565, \"value_count\": 93, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 186.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9271673560142517, \"percentile_inc_nulls\": 0.9272533655166626, \"value_count\": 92, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 368.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9267718195915222, \"percentile_inc_nulls\": 0.9268583059310913, \"value_count\": 91, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 91.0, 
\"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9263805747032166, \"percentile_inc_nulls\": 0.9264675378799438, \"value_count\": 90, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9259936809539795, \"percentile_inc_nulls\": 0.926081120967865, \"value_count\": 89, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 89.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9256155490875244, \"percentile_inc_nulls\": 0.9257033467292786, \"value_count\": 87, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 87.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9244940280914307, \"percentile_inc_nulls\": 0.9245831966400146, \"value_count\": 86, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 258.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.924124538898468, \"percentile_inc_nulls\": 0.9242141246795654, \"value_count\": 85, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 85.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9230291247367859, \"percentile_inc_nulls\": 0.9231200218200684, \"value_count\": 84, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 252.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9226683378219604, \"percentile_inc_nulls\": 0.9227596521377563, \"value_count\": 83, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, 
\"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 83.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9209295511245728, \"percentile_inc_nulls\": 0.921022891998291, \"value_count\": 80, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 400.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.920586109161377, \"percentile_inc_nulls\": 0.9206799268722534, \"value_count\": 79, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 79.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9192299246788025, \"percentile_inc_nulls\": 0.9193252921104431, \"value_count\": 78, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 312.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9185605049133301, \"percentile_inc_nulls\": 0.9186566472053528, \"value_count\": 77, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 154.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9172390103340149, \"percentile_inc_nulls\": 0.9173367619514465, \"value_count\": 76, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 304.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9169129729270935, \"percentile_inc_nulls\": 0.9170111417770386, \"value_count\": 75, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 75.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9162870645523071, \"percentile_inc_nulls\": 0.9163858890533447, \"value_count\": 72, 
\"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9150525331497192, \"percentile_inc_nulls\": 0.9151528477668762, \"value_count\": 71, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 284.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9147482514381409, \"percentile_inc_nulls\": 0.9148489236831665, \"value_count\": 70, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9141483306884766, \"percentile_inc_nulls\": 0.9142497181892395, \"value_count\": 69, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9135571718215942, \"percentile_inc_nulls\": 0.9136592745780945, \"value_count\": 68, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 136.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9132659435272217, \"percentile_inc_nulls\": 0.9133683443069458, \"value_count\": 67, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 67.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9129833579063416, \"percentile_inc_nulls\": 0.9130861163139343, \"value_count\": 65, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 65.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 
0.9118705987930298, \"percentile_inc_nulls\": 0.9119746685028076, \"value_count\": 64, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 256.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.911322832107544, \"percentile_inc_nulls\": 0.9114275574684143, \"value_count\": 63, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 126.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9107925295829773, \"percentile_inc_nulls\": 0.9108978509902954, \"value_count\": 61, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 122.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9100100994110107, \"percentile_inc_nulls\": 0.9101163744926453, \"value_count\": 60, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 180.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9092406630516052, \"percentile_inc_nulls\": 0.9093478918075562, \"value_count\": 59, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 177.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9089885354042053, \"percentile_inc_nulls\": 0.9090960621833801, \"value_count\": 58, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9079974889755249, \"percentile_inc_nulls\": 0.9081060886383057, \"value_count\": 57, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, 
\"sum_tokens_in_value_count_group\": 228.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.907267153263092, \"percentile_inc_nulls\": 0.9073767066001892, \"value_count\": 56, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 168.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9060717821121216, \"percentile_inc_nulls\": 0.9061827063560486, \"value_count\": 55, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 275.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9046633839607239, \"percentile_inc_nulls\": 0.9047759771347046, \"value_count\": 54, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 324.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9035331606864929, \"percentile_inc_nulls\": 0.9036471247673035, \"value_count\": 52, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 260.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9022030234336853, \"percentile_inc_nulls\": 0.90231853723526, \"value_count\": 51, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 306.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9013336300849915, \"percentile_inc_nulls\": 0.9014501571655273, \"value_count\": 50, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9000556468963623, \"percentile_inc_nulls\": 0.9001736640930176, \"value_count\": 49, \"group_name\": 
\"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 294.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8992210626602173, \"percentile_inc_nulls\": 0.899340033531189, \"value_count\": 48, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 192.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8981994986534119, \"percentile_inc_nulls\": 0.8983197212219238, \"value_count\": 47, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 235.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8965998291969299, \"percentile_inc_nulls\": 0.8967219591140747, \"value_count\": 46, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 368.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8942524790763855, \"percentile_inc_nulls\": 0.8943774104118347, \"value_count\": 45, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 540.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8934874534606934, \"percentile_inc_nulls\": 0.8936132192611694, \"value_count\": 44, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 176.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8916182518005371, \"percentile_inc_nulls\": 0.8917462825775146, \"value_count\": 43, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 430.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.890340268611908, 
\"percentile_inc_nulls\": 0.8904697895050049, \"value_count\": 42, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 294.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8883798122406006, \"percentile_inc_nulls\": 0.8885116577148438, \"value_count\": 41, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 451.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8862932920455933, \"percentile_inc_nulls\": 0.8864275813102722, \"value_count\": 40, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 480.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8842589259147644, \"percentile_inc_nulls\": 0.8843955993652344, \"value_count\": 39, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 468.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.882607102394104, \"percentile_inc_nulls\": 0.8827457427978516, \"value_count\": 38, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 380.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8798729181289673, \"percentile_inc_nulls\": 0.8800147771835327, \"value_count\": 37, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 629.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8773690462112427, \"percentile_inc_nulls\": 0.8775138854980469, \"value_count\": 36, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 576.0, 
\"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8755433559417725, \"percentile_inc_nulls\": 0.8756903409957886, \"value_count\": 35, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 420.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8730308413505554, \"percentile_inc_nulls\": 0.8731808066368103, \"value_count\": 34, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 578.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8700184226036072, \"percentile_inc_nulls\": 0.8701719045639038, \"value_count\": 33, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 693.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8672363758087158, \"percentile_inc_nulls\": 0.8673931956291199, \"value_count\": 32, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 640.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8641370534896851, \"percentile_inc_nulls\": 0.8642975091934204, \"value_count\": 31, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 713.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8542260527610779, \"percentile_inc_nulls\": 0.854398250579834, \"value_count\": 30, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2280.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8430066704750061, \"percentile_inc_nulls\": 0.8431921005249023, \"value_count\": 29, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, 
\"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2581.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8309569954872131, \"percentile_inc_nulls\": 0.8311566710472107, \"value_count\": 28, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2772.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8173424601554871, \"percentile_inc_nulls\": 0.8175581693649292, \"value_count\": 27, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3132.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8058144450187683, \"percentile_inc_nulls\": 0.8060437440872192, \"value_count\": 26, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2652.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.7959251999855042, \"percentile_inc_nulls\": 0.796166181564331, \"value_count\": 25, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2275.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.7849709987640381, \"percentile_inc_nulls\": 0.7852249145507812, \"value_count\": 24, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2520.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.7717736959457397, \"percentile_inc_nulls\": 0.7720432281494141, \"value_count\": 23, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3036.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.7584808468818665, \"percentile_inc_nulls\": 0.7587660551071167, 
\"value_count\": 22, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3058.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.7432361841201782, \"percentile_inc_nulls\": 0.7435394525527954, \"value_count\": 21, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3507.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.7268048524856567, \"percentile_inc_nulls\": 0.7271274328231812, \"value_count\": 20, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3780.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.7090476751327515, \"percentile_inc_nulls\": 0.7093912959098816, \"value_count\": 19, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 4085.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.6908167004585266, \"percentile_inc_nulls\": 0.6911818385124207, \"value_count\": 18, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 4194.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.670790433883667, \"percentile_inc_nulls\": 0.6711792349815369, \"value_count\": 17, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 4607.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.6495774984359741, \"percentile_inc_nulls\": 0.6499912738800049, \"value_count\": 16, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 4880.0, \"distinct_value_count\": 40089}, 
{\"percentile_ex_nulls\": 0.6246044635772705, \"percentile_inc_nulls\": 0.6250478029251099, \"value_count\": 15, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 5745.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.5996531248092651, \"percentile_inc_nulls\": 0.6001259088516235, \"value_count\": 14, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 5740.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.5716241598129272, \"percentile_inc_nulls\": 0.5721300840377808, \"value_count\": 13, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 6448.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.5411088466644287, \"percentile_inc_nulls\": 0.5416507720947266, \"value_count\": 12, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 7020.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.5080678462982178, \"percentile_inc_nulls\": 0.5086488723754883, \"value_count\": 11, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 7601.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.4703800678253174, \"percentile_inc_nulls\": 0.47100555896759033, \"value_count\": 10, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 8670.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.42902785539627075, \"percentile_inc_nulls\": 0.42970216274261475, \"value_count\": 9, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 
230320, \"sum_tokens_in_value_count_group\": 9513.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.38677579164505005, \"percentile_inc_nulls\": 0.38749998807907104, \"value_count\": 8, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 9720.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.3392161726951599, \"percentile_inc_nulls\": 0.3399965167045593, \"value_count\": 7, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 10941.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.2879660129547119, \"percentile_inc_nulls\": 0.28880685567855835, \"value_count\": 6, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 11790.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.23104310035705566, \"percentile_inc_nulls\": 0.23195117712020874, \"value_count\": 5, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 13095.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.16950809955596924, \"percentile_inc_nulls\": 0.1704888939857483, \"value_count\": 4, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 14156.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.10641694068908691, \"percentile_inc_nulls\": 0.1074722409248352, \"value_count\": 3, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 14514.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.04255199432373047, \"percentile_inc_nulls\": 0.04368269443511963, \"value_count\": 2, 
\"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 14692.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0011809468269348145, \"value_count\": 1, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 9789.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 906, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 906.0, \"distinct_value_count\": 40089}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"street_address\\\"\", \"subtitle\": \"In this col, 272 values (0.1%) are null and there are 40089 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 906, \"group_name\": \"_street_address_\", \"value\": \"1585 broadway\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 40089}, {\"value_count\": 798, \"group_name\": \"_street_address_\", \"value\": \"4500 park granada\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 40089}, {\"value_count\": 744, \"group_name\": \"_street_address_\", 
\"value\": \"301 south college street\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 40089}, {\"value_count\": 613, \"group_name\": \"_street_address_\", \"value\": \"388 greenwich st\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 40089}, {\"value_count\": 586, \"group_name\": \"_street_address_\", \"value\": \"711 high street\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 40089}, {\"value_count\": 473, \"group_name\": \"_street_address_\", \"value\": \"world financial center\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 40089}, {\"value_count\": 467, \"group_name\": \"_street_address_\", \"value\": \"c/o state street bank & trust co\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 40089}, {\"value_count\": 435, \"group_name\": \"_street_address_\", \"value\": \"lehman abs corp\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 40089}, {\"value_count\": 429, \"group_name\": \"_street_address_\", \"value\": \"one international place\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 40089}, {\"value_count\": 407, \"group_name\": \"_street_address_\", \"value\": \"383 madison avenue\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 40089}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": 
\"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"8943 fullbright ave\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 40089}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"400 e vine st ste 300\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 40089}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"8000 maryland ave ste 1190\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 40089}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"one international place ste 520\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 40089}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"6855 south red road ste 400\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 40089}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 906]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.99008709192276, \"percentile_inc_nulls\": 0.9901137351989746, \"value_count\": 2277, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2277.0, 
\"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.9816457033157349, \"percentile_inc_nulls\": 0.9816950559616089, \"value_count\": 1939, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1939.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.973717987537384, \"percentile_inc_nulls\": 0.9737886190414429, \"value_count\": 1821, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1821.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.9667393565177917, \"percentile_inc_nulls\": 0.9668287634849548, \"value_count\": 1603, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1603.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.9604964852333069, \"percentile_inc_nulls\": 0.9606026411056519, \"value_count\": 1434, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1434.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.9545148015022278, \"percentile_inc_nulls\": 0.9546370506286621, \"value_count\": 1374, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1374.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.9485548734664917, \"percentile_inc_nulls\": 0.9486930966377258, \"value_count\": 1369, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1369.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.9427255392074585, \"percentile_inc_nulls\": 0.9428794980049133, \"value_count\": 1339, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 
230320, \"sum_tokens_in_value_count_group\": 1339.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.937832236289978, \"percentile_inc_nulls\": 0.9379993081092834, \"value_count\": 1124, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1124.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.933365523815155, \"percentile_inc_nulls\": 0.9335446357727051, \"value_count\": 1026, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1026.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.9293820858001709, \"percentile_inc_nulls\": 0.9295718669891357, \"value_count\": 915, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 915.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.9254465699195862, \"percentile_inc_nulls\": 0.9256469011306763, \"value_count\": 904, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 904.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.9215763211250305, \"percentile_inc_nulls\": 0.9217870831489563, \"value_count\": 889, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 889.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.9181152582168579, \"percentile_inc_nulls\": 0.9183353781700134, \"value_count\": 795, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 795.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.9117678999900818, \"percentile_inc_nulls\": 0.9120050668716431, \"value_count\": 729, \"group_name\": \"_zip_code_\", 
\"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1458.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.908716082572937, \"percentile_inc_nulls\": 0.9089614152908325, \"value_count\": 701, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 701.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.9057644605636597, \"percentile_inc_nulls\": 0.9060177206993103, \"value_count\": 678, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 678.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.9029172658920288, \"percentile_inc_nulls\": 0.9031782150268555, \"value_count\": 654, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 654.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.9000918865203857, \"percentile_inc_nulls\": 0.9003603458404541, \"value_count\": 649, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 649.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8973099589347839, \"percentile_inc_nulls\": 0.8975859880447388, \"value_count\": 639, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 639.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8918071985244751, \"percentile_inc_nulls\": 0.8920979499816895, \"value_count\": 632, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1264.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8890731930732727, \"percentile_inc_nulls\": 0.8893713355064392, 
\"value_count\": 628, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 628.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8864045143127441, \"percentile_inc_nulls\": 0.8867098093032837, \"value_count\": 613, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 613.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8837401866912842, \"percentile_inc_nulls\": 0.8840526342391968, \"value_count\": 612, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 612.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.88108891248703, \"percentile_inc_nulls\": 0.8814084529876709, \"value_count\": 609, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 609.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8786247968673706, \"percentile_inc_nulls\": 0.8789510130882263, \"value_count\": 566, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 566.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.876182496547699, \"percentile_inc_nulls\": 0.87651526927948, \"value_count\": 561, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 561.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8737488985061646, \"percentile_inc_nulls\": 0.8740882277488708, \"value_count\": 559, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 559.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.87142413854599, 
\"percentile_inc_nulls\": 0.8717697262763977, \"value_count\": 534, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 534.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8691777586936951, \"percentile_inc_nulls\": 0.8695293664932251, \"value_count\": 516, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 516.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8669357299804688, \"percentile_inc_nulls\": 0.8672933578491211, \"value_count\": 515, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 515.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.864698052406311, \"percentile_inc_nulls\": 0.8650616407394409, \"value_count\": 514, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 514.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8624733686447144, \"percentile_inc_nulls\": 0.8628430366516113, \"value_count\": 511, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 511.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8580589294433594, \"percentile_inc_nulls\": 0.8584403991699219, \"value_count\": 507, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1014.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8558691740036011, \"percentile_inc_nulls\": 0.8562564849853516, \"value_count\": 503, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 503.0, \"distinct_value_count\": 11612}, 
{\"percentile_ex_nulls\": 0.853775143623352, \"percentile_inc_nulls\": 0.8541681170463562, \"value_count\": 481, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 481.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8516898155212402, \"percentile_inc_nulls\": 0.8520883917808533, \"value_count\": 479, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 479.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8496524095535278, \"percentile_inc_nulls\": 0.8500564098358154, \"value_count\": 468, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 468.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8476279973983765, \"percentile_inc_nulls\": 0.8480374813079834, \"value_count\": 465, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 465.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8456079959869385, \"percentile_inc_nulls\": 0.84602290391922, \"value_count\": 464, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 464.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8436489105224609, \"percentile_inc_nulls\": 0.8440691232681274, \"value_count\": 450, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 450.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8398004174232483, \"percentile_inc_nulls\": 0.8402310013771057, \"value_count\": 442, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 884.0, 
\"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8378892540931702, \"percentile_inc_nulls\": 0.8383249044418335, \"value_count\": 439, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 439.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8360172510147095, \"percentile_inc_nulls\": 0.8364579677581787, \"value_count\": 430, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 430.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8341495990753174, \"percentile_inc_nulls\": 0.8345953226089478, \"value_count\": 429, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 429.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8322862982749939, \"percentile_inc_nulls\": 0.8327370882034302, \"value_count\": 428, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 428.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8304448127746582, \"percentile_inc_nulls\": 0.8309004902839661, \"value_count\": 423, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 423.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8286119699478149, \"percentile_inc_nulls\": 0.8290725946426392, \"value_count\": 421, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 421.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8267922401428223, \"percentile_inc_nulls\": 0.8272577524185181, \"value_count\": 418, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, 
\"sum_tokens_in_value_count_group\": 418.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8249855041503906, \"percentile_inc_nulls\": 0.825455904006958, \"value_count\": 415, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 415.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.823187530040741, \"percentile_inc_nulls\": 0.8236627578735352, \"value_count\": 413, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 413.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8213939070701599, \"percentile_inc_nulls\": 0.8218739032745361, \"value_count\": 412, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 412.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8178240060806274, \"percentile_inc_nulls\": 0.8183136582374573, \"value_count\": 410, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 820.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.816056489944458, \"percentile_inc_nulls\": 0.8165508508682251, \"value_count\": 406, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 406.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.812617301940918, \"percentile_inc_nulls\": 0.8131208419799805, \"value_count\": 395, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 790.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8109150528907776, \"percentile_inc_nulls\": 0.8114232420921326, \"value_count\": 391, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, 
\"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 391.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8092389702796936, \"percentile_inc_nulls\": 0.8097516298294067, \"value_count\": 385, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 385.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8059042096138, \"percentile_inc_nulls\": 0.8064258098602295, \"value_count\": 383, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 766.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8025955557823181, \"percentile_inc_nulls\": 0.8031260967254639, \"value_count\": 380, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 760.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8009586334228516, \"percentile_inc_nulls\": 0.8014935851097107, \"value_count\": 376, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 376.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7993347644805908, \"percentile_inc_nulls\": 0.7998740673065186, \"value_count\": 373, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 373.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7977153062820435, \"percentile_inc_nulls\": 0.7982589602470398, \"value_count\": 372, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 372.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7961306571960449, \"percentile_inc_nulls\": 0.7966785430908203, \"value_count\": 364, \"group_name\": 
\"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 364.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7945720553398132, \"percentile_inc_nulls\": 0.7951241731643677, \"value_count\": 358, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 358.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7930309772491455, \"percentile_inc_nulls\": 0.7935872077941895, \"value_count\": 354, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 354.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7915289998054504, \"percentile_inc_nulls\": 0.7920892834663391, \"value_count\": 345, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 345.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7900357246398926, \"percentile_inc_nulls\": 0.790600061416626, \"value_count\": 343, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 343.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7885729670524597, \"percentile_inc_nulls\": 0.7891411781311035, \"value_count\": 336, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 336.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7871232628822327, \"percentile_inc_nulls\": 0.7876954078674316, \"value_count\": 333, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 333.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7856822609901428, \"percentile_inc_nulls\": 0.7862582206726074, 
\"value_count\": 331, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 331.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7842717170715332, \"percentile_inc_nulls\": 0.7848514914512634, \"value_count\": 324, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 324.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7828916907310486, \"percentile_inc_nulls\": 0.7834751605987549, \"value_count\": 317, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 317.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7801402807235718, \"percentile_inc_nulls\": 0.7807311415672302, \"value_count\": 316, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 632.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7787688970565796, \"percentile_inc_nulls\": 0.7793635129928589, \"value_count\": 315, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 315.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7774062752723694, \"percentile_inc_nulls\": 0.77800452709198, \"value_count\": 313, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 313.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7760480046272278, \"percentile_inc_nulls\": 0.7766498923301697, \"value_count\": 312, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 312.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.774702787399292, 
\"percentile_inc_nulls\": 0.7753082513809204, \"value_count\": 309, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 309.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7733619213104248, \"percentile_inc_nulls\": 0.7739709615707397, \"value_count\": 308, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 308.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7720253467559814, \"percentile_inc_nulls\": 0.7726380825042725, \"value_count\": 307, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 307.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7706975936889648, \"percentile_inc_nulls\": 0.7713138461112976, \"value_count\": 305, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 305.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7681115865707397, \"percentile_inc_nulls\": 0.7687348127365112, \"value_count\": 297, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 594.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7668229341506958, \"percentile_inc_nulls\": 0.7674496173858643, \"value_count\": 296, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 296.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7642543911933899, \"percentile_inc_nulls\": 0.7648879885673523, \"value_count\": 295, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 590.0, \"distinct_value_count\": 11612}, 
{\"percentile_ex_nulls\": 0.7629787921905518, \"percentile_inc_nulls\": 0.7636158466339111, \"value_count\": 293, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 293.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7617163062095642, \"percentile_inc_nulls\": 0.7623567581176758, \"value_count\": 290, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 290.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.759269654750824, \"percentile_inc_nulls\": 0.7599166631698608, \"value_count\": 281, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 562.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.756831705570221, \"percentile_inc_nulls\": 0.7574852705001831, \"value_count\": 280, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 560.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7556257843971252, \"percentile_inc_nulls\": 0.7562825679779053, \"value_count\": 277, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 277.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7544373273849487, \"percentile_inc_nulls\": 0.7550972700119019, \"value_count\": 273, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 273.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.753253161907196, \"percentile_inc_nulls\": 0.7539162635803223, \"value_count\": 272, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 272.0, 
\"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7497268319129944, \"percentile_inc_nulls\": 0.7503994703292847, \"value_count\": 270, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 810.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.747410774230957, \"percentile_inc_nulls\": 0.748089611530304, \"value_count\": 266, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 532.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7462570667266846, \"percentile_inc_nulls\": 0.7469390630722046, \"value_count\": 265, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 265.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7451338768005371, \"percentile_inc_nulls\": 0.7458188533782959, \"value_count\": 258, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 258.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7440193891525269, \"percentile_inc_nulls\": 0.7447073459625244, \"value_count\": 256, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 256.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7429092526435852, \"percentile_inc_nulls\": 0.7436002492904663, \"value_count\": 255, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 255.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7418078184127808, \"percentile_inc_nulls\": 0.7425017356872559, \"value_count\": 253, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, 
\"sum_tokens_in_value_count_group\": 253.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7407151460647583, \"percentile_inc_nulls\": 0.7414119243621826, \"value_count\": 251, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 251.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7396267652511597, \"percentile_inc_nulls\": 0.7403265237808228, \"value_count\": 250, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 250.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.737476110458374, \"percentile_inc_nulls\": 0.7381816506385803, \"value_count\": 247, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 494.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7353428602218628, \"percentile_inc_nulls\": 0.7360541820526123, \"value_count\": 245, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 490.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7332184314727783, \"percentile_inc_nulls\": 0.7339353561401367, \"value_count\": 244, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 488.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7321605086326599, \"percentile_inc_nulls\": 0.7328803539276123, \"value_count\": 243, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 243.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7311112880706787, \"percentile_inc_nulls\": 0.7318339347839355, \"value_count\": 241, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 
229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 241.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7300664782524109, \"percentile_inc_nulls\": 0.7307919263839722, \"value_count\": 240, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 240.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7279855012893677, \"percentile_inc_nulls\": 0.7287166118621826, \"value_count\": 239, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 478.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7269493937492371, \"percentile_inc_nulls\": 0.7276831865310669, \"value_count\": 238, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 238.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7259175777435303, \"percentile_inc_nulls\": 0.7266542315483093, \"value_count\": 237, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 237.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7248901724815369, \"percentile_inc_nulls\": 0.7256295680999756, \"value_count\": 236, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 236.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7238671183586121, \"percentile_inc_nulls\": 0.7246092557907104, \"value_count\": 235, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 235.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7218557596206665, \"percentile_inc_nulls\": 0.7226033210754395, \"value_count\": 231, \"group_name\": 
\"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 462.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7198618650436401, \"percentile_inc_nulls\": 0.7206147909164429, \"value_count\": 229, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 458.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7188736796379089, \"percentile_inc_nulls\": 0.719629168510437, \"value_count\": 227, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 227.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7178941369056702, \"percentile_inc_nulls\": 0.7186523079872131, \"value_count\": 225, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 225.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7169276475906372, \"percentile_inc_nulls\": 0.7176884412765503, \"value_count\": 222, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 222.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7159655094146729, \"percentile_inc_nulls\": 0.716728925704956, \"value_count\": 221, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 221.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7150077819824219, \"percentile_inc_nulls\": 0.7157737016677856, \"value_count\": 220, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 220.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7131009101867676, \"percentile_inc_nulls\": 0.7138720154762268, 
\"value_count\": 219, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 438.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.709304690361023, \"percentile_inc_nulls\": 0.7100859880447388, \"value_count\": 218, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 872.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.708368718624115, \"percentile_inc_nulls\": 0.7091524600982666, \"value_count\": 215, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 215.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7065054178237915, \"percentile_inc_nulls\": 0.707294225692749, \"value_count\": 214, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 428.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.705578088760376, \"percentile_inc_nulls\": 0.7063694000244141, \"value_count\": 213, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 213.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7028093338012695, \"percentile_inc_nulls\": 0.7036080360412598, \"value_count\": 212, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 636.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7009808421134949, \"percentile_inc_nulls\": 0.7017844915390015, \"value_count\": 210, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 420.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7000709772109985, 
\"percentile_inc_nulls\": 0.7008770704269409, \"value_count\": 209, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 209.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6982599496841431, \"percentile_inc_nulls\": 0.6990708708763123, \"value_count\": 208, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 416.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6973587274551392, \"percentile_inc_nulls\": 0.6981720924377441, \"value_count\": 207, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 207.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6955651044845581, \"percentile_inc_nulls\": 0.6963832974433899, \"value_count\": 206, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 412.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6937975883483887, \"percentile_inc_nulls\": 0.6946204900741577, \"value_count\": 203, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 406.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6929181814193726, \"percentile_inc_nulls\": 0.6937434673309326, \"value_count\": 202, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 202.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.692043125629425, \"percentile_inc_nulls\": 0.6928707957267761, \"value_count\": 201, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 201.0, \"distinct_value_count\": 11612}, 
{\"percentile_ex_nulls\": 0.6911724805831909, \"percentile_inc_nulls\": 0.6920024156570435, \"value_count\": 200, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6894571781158447, \"percentile_inc_nulls\": 0.6902917623519897, \"value_count\": 197, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 394.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6877505779266357, \"percentile_inc_nulls\": 0.6885898113250732, \"value_count\": 196, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 392.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6869059801101685, \"percentile_inc_nulls\": 0.6877474784851074, \"value_count\": 194, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 194.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6860701441764832, \"percentile_inc_nulls\": 0.6869138479232788, \"value_count\": 192, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 192.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6835756301879883, \"percentile_inc_nulls\": 0.6844260096549988, \"value_count\": 191, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 573.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.682752788066864, \"percentile_inc_nulls\": 0.683605432510376, \"value_count\": 189, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 
189.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6819343566894531, \"percentile_inc_nulls\": 0.6827892065048218, \"value_count\": 188, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 188.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6794919967651367, \"percentile_inc_nulls\": 0.6803534030914307, \"value_count\": 187, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 561.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.676253080368042, \"percentile_inc_nulls\": 0.6771231293678284, \"value_count\": 186, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 744.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6754476428031921, \"percentile_inc_nulls\": 0.6763198971748352, \"value_count\": 185, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 185.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6746509671211243, \"percentile_inc_nulls\": 0.6755253672599792, \"value_count\": 183, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 183.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.673858642578125, \"percentile_inc_nulls\": 0.6747351884841919, \"value_count\": 182, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 182.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6722826957702637, \"percentile_inc_nulls\": 0.6731634140014648, \"value_count\": 181, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, 
\"sum_tokens_in_value_count_group\": 362.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6715034246444702, \"percentile_inc_nulls\": 0.6723862886428833, \"value_count\": 179, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 179.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6699622869491577, \"percentile_inc_nulls\": 0.6708492636680603, \"value_count\": 177, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 354.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6684298515319824, \"percentile_inc_nulls\": 0.6693209409713745, \"value_count\": 176, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 352.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6661442518234253, \"percentile_inc_nulls\": 0.667041540145874, \"value_count\": 175, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 525.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6646292209625244, \"percentile_inc_nulls\": 0.6655305624008179, \"value_count\": 174, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 348.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6638760566711426, \"percentile_inc_nulls\": 0.6647794246673584, \"value_count\": 173, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 173.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6623871922492981, \"percentile_inc_nulls\": 0.6632945537567139, \"value_count\": 171, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 
229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 342.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6586867570877075, \"percentile_inc_nulls\": 0.6596040725708008, \"value_count\": 170, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 850.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6572152376174927, \"percentile_inc_nulls\": 0.6581364870071411, \"value_count\": 169, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 338.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6564838886260986, \"percentile_inc_nulls\": 0.6574070453643799, \"value_count\": 168, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 168.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6543027758598328, \"percentile_inc_nulls\": 0.6552318334579468, \"value_count\": 167, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 501.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6528661251068115, \"percentile_inc_nulls\": 0.6537990570068359, \"value_count\": 165, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 330.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6514555811882019, \"percentile_inc_nulls\": 0.6523923277854919, \"value_count\": 162, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 324.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6507546901702881, \"percentile_inc_nulls\": 0.6516932845115662, \"value_count\": 161, \"group_name\": 
\"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 161.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6493615508079529, \"percentile_inc_nulls\": 0.6503039598464966, \"value_count\": 160, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 320.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6479771137237549, \"percentile_inc_nulls\": 0.6489232778549194, \"value_count\": 159, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 318.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6472892761230469, \"percentile_inc_nulls\": 0.6482372283935547, \"value_count\": 158, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 158.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6452388167381287, \"percentile_inc_nulls\": 0.6461922526359558, \"value_count\": 157, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 471.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6445596814155579, \"percentile_inc_nulls\": 0.645514965057373, \"value_count\": 156, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 156.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6425353288650513, \"percentile_inc_nulls\": 0.643496036529541, \"value_count\": 155, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 465.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6405240297317505, \"percentile_inc_nulls\": 0.64149010181427, 
\"value_count\": 154, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 462.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6392005681991577, \"percentile_inc_nulls\": 0.6401702165603638, \"value_count\": 152, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 304.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6385431289672852, \"percentile_inc_nulls\": 0.6395145654678345, \"value_count\": 151, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 151.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6365840435028076, \"percentile_inc_nulls\": 0.6375607848167419, \"value_count\": 150, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 450.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6352954506874084, \"percentile_inc_nulls\": 0.6362756490707397, \"value_count\": 148, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 296.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6340242624282837, \"percentile_inc_nulls\": 0.6350078582763672, \"value_count\": 146, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 292.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.63213050365448, \"percentile_inc_nulls\": 0.6331191062927246, \"value_count\": 145, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 435.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6315035820007324, 
\"percentile_inc_nulls\": 0.6324939131736755, \"value_count\": 144, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6302584409713745, \"percentile_inc_nulls\": 0.6312521696090698, \"value_count\": 143, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 286.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6290220618247986, \"percentile_inc_nulls\": 0.6300190687179565, \"value_count\": 142, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 284.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6271805763244629, \"percentile_inc_nulls\": 0.6281825304031372, \"value_count\": 141, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 423.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6253520846366882, \"percentile_inc_nulls\": 0.6263589859008789, \"value_count\": 140, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 420.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.62354975938797, \"percentile_inc_nulls\": 0.6245614886283875, \"value_count\": 138, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 414.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6223568916320801, \"percentile_inc_nulls\": 0.6233718395233154, \"value_count\": 137, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 274.0, \"distinct_value_count\": 11612}, 
{\"percentile_ex_nulls\": 0.6193965077400208, \"percentile_inc_nulls\": 0.6204193830490112, \"value_count\": 136, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 680.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6170456409454346, \"percentile_inc_nulls\": 0.618074893951416, \"value_count\": 135, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 540.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.616462230682373, \"percentile_inc_nulls\": 0.6174930334091187, \"value_count\": 134, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 134.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6141462326049805, \"percentile_inc_nulls\": 0.6151832342147827, \"value_count\": 133, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 532.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6135715246200562, \"percentile_inc_nulls\": 0.6146100759506226, \"value_count\": 132, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 132.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6130012273788452, \"percentile_inc_nulls\": 0.6140413284301758, \"value_count\": 131, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 131.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6124353408813477, \"percentile_inc_nulls\": 0.6134768724441528, \"value_count\": 130, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 
130.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6113120913505554, \"percentile_inc_nulls\": 0.6123567223548889, \"value_count\": 129, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 258.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6090831160545349, \"percentile_inc_nulls\": 0.6101337671279907, \"value_count\": 128, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 512.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6068715453147888, \"percentile_inc_nulls\": 0.6079280972480774, \"value_count\": 127, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 508.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.605774462223053, \"percentile_inc_nulls\": 0.6068339347839355, \"value_count\": 126, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 252.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6046860814094543, \"percentile_inc_nulls\": 0.6057485342025757, \"value_count\": 125, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 250.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6036064624786377, \"percentile_inc_nulls\": 0.6046717166900635, \"value_count\": 124, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 248.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6019999980926514, \"percentile_inc_nulls\": 0.6030696630477905, \"value_count\": 123, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, 
\"sum_tokens_in_value_count_group\": 369.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5988132357597351, \"percentile_inc_nulls\": 0.5998914241790771, \"value_count\": 122, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 732.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5977597236633301, \"percentile_inc_nulls\": 0.5988407135009766, \"value_count\": 121, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 242.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.596192479133606, \"percentile_inc_nulls\": 0.5972777009010315, \"value_count\": 120, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 360.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5941201448440552, \"percentile_inc_nulls\": 0.5952110290527344, \"value_count\": 119, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 476.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5936064720153809, \"percentile_inc_nulls\": 0.5946986675262451, \"value_count\": 118, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 118.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5915690660476685, \"percentile_inc_nulls\": 0.592666745185852, \"value_count\": 117, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 468.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5895664691925049, \"percentile_inc_nulls\": 0.5906695127487183, \"value_count\": 115, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 
229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 460.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5875812768936157, \"percentile_inc_nulls\": 0.5886896848678589, \"value_count\": 114, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 456.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.585613489151001, \"percentile_inc_nulls\": 0.5867271423339844, \"value_count\": 113, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 452.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5841506719589233, \"percentile_inc_nulls\": 0.5852683186531067, \"value_count\": 112, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 336.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5817345380783081, \"percentile_inc_nulls\": 0.5828586220741272, \"value_count\": 111, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 555.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5788612365722656, \"percentile_inc_nulls\": 0.5799930095672607, \"value_count\": 110, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 660.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5764885544776917, \"percentile_inc_nulls\": 0.5776268243789673, \"value_count\": 109, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 545.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5746078491210938, \"percentile_inc_nulls\": 0.5757511258125305, \"value_count\": 108, \"group_name\": 
\"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 432.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5722787380218506, \"percentile_inc_nulls\": 0.5734282732009888, \"value_count\": 107, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 535.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5704329013824463, \"percentile_inc_nulls\": 0.571587324142456, \"value_count\": 106, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 424.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5681473016738892, \"percentile_inc_nulls\": 0.5693079233169556, \"value_count\": 105, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 525.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5663362741470337, \"percentile_inc_nulls\": 0.5675017237663269, \"value_count\": 104, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 416.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5645425915718079, \"percentile_inc_nulls\": 0.5657129287719727, \"value_count\": 103, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 412.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5623223185539246, \"percentile_inc_nulls\": 0.5634986162185669, \"value_count\": 102, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 510.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5601238012313843, \"percentile_inc_nulls\": 0.5613059997558594, 
\"value_count\": 101, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 505.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5588177442550659, \"percentile_inc_nulls\": 0.5600034594535828, \"value_count\": 100, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 300.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5570937991142273, \"percentile_inc_nulls\": 0.5582841634750366, \"value_count\": 99, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 396.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5549606084823608, \"percentile_inc_nulls\": 0.5561566352844238, \"value_count\": 98, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 490.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.552426815032959, \"percentile_inc_nulls\": 0.5536297559738159, \"value_count\": 97, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 582.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5490833520889282, \"percentile_inc_nulls\": 0.5502952337265015, \"value_count\": 96, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 768.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5466018915176392, \"percentile_inc_nulls\": 0.5478204488754272, \"value_count\": 95, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 570.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5441465377807617, 
\"percentile_inc_nulls\": 0.5453716516494751, \"value_count\": 94, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 564.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5425270199775696, \"percentile_inc_nulls\": 0.5437564849853516, \"value_count\": 93, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 372.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.540524423122406, \"percentile_inc_nulls\": 0.5417592525482178, \"value_count\": 92, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 460.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5385435819625854, \"percentile_inc_nulls\": 0.539783775806427, \"value_count\": 91, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 455.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5361927151679993, \"percentile_inc_nulls\": 0.537439227104187, \"value_count\": 90, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 540.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5338679552078247, \"percentile_inc_nulls\": 0.5351207256317139, \"value_count\": 89, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 534.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5315693020820618, \"percentile_inc_nulls\": 0.5328282117843628, \"value_count\": 88, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 528.0, \"distinct_value_count\": 11612}, 
{\"percentile_ex_nulls\": 0.5300543308258057, \"percentile_inc_nulls\": 0.5313172936439514, \"value_count\": 87, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 348.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5289310812950134, \"percentile_inc_nulls\": 0.5301971435546875, \"value_count\": 86, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 258.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5248606204986572, \"percentile_inc_nulls\": 0.5261375904083252, \"value_count\": 85, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 935.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5215693712234497, \"percentile_inc_nulls\": 0.5228551626205444, \"value_count\": 84, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 756.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5186786651611328, \"percentile_inc_nulls\": 0.5199722051620483, \"value_count\": 83, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 664.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5158227682113647, \"percentile_inc_nulls\": 0.5171239972114563, \"value_count\": 82, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 656.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5130016803741455, \"percentile_inc_nulls\": 0.5143105387687683, \"value_count\": 81, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 648.0, 
\"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5102154612541199, \"percentile_inc_nulls\": 0.5115317702293396, \"value_count\": 80, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 640.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5074640512466431, \"percentile_inc_nulls\": 0.5087877511978149, \"value_count\": 79, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 632.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5047475099563599, \"percentile_inc_nulls\": 0.5060784816741943, \"value_count\": 78, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 624.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5027361512184143, \"percentile_inc_nulls\": 0.5040726065635681, \"value_count\": 77, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 462.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.4990966320037842, \"percentile_inc_nulls\": 0.5004428625106812, \"value_count\": 76, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 836.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.49583154916763306, \"percentile_inc_nulls\": 0.497186541557312, \"value_count\": 75, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 750.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.492287814617157, \"percentile_inc_nulls\": 0.4936522841453552, \"value_count\": 74, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, 
\"sum_tokens_in_value_count_group\": 814.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.4897453784942627, \"percentile_inc_nulls\": 0.4911167025566101, \"value_count\": 73, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 584.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.48723775148391724, \"percentile_inc_nulls\": 0.48861581087112427, \"value_count\": 72, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 576.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.48414677381515503, \"percentile_inc_nulls\": 0.4855331778526306, \"value_count\": 71, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 710.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.48353731632232666, \"percentile_inc_nulls\": 0.4849253296852112, \"value_count\": 70, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 140.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.48233574628829956, \"percentile_inc_nulls\": 0.48372697830200195, \"value_count\": 69, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 276.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.47937536239624023, \"percentile_inc_nulls\": 0.48077458143234253, \"value_count\": 68, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 680.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.47733360528945923, \"percentile_inc_nulls\": 0.4787382483482361, \"value_count\": 67, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 
229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 469.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.47474759817123413, \"percentile_inc_nulls\": 0.4761592745780945, \"value_count\": 66, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 594.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.47304975986480713, \"percentile_inc_nulls\": 0.4744659662246704, \"value_count\": 65, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 390.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.46914905309677124, \"percentile_inc_nulls\": 0.470575749874115, \"value_count\": 64, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 896.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.4655835032463074, \"percentile_inc_nulls\": 0.46701979637145996, \"value_count\": 63, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 819.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.4620746374130249, \"percentile_inc_nulls\": 0.463520348072052, \"value_count\": 62, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 806.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.45782560110092163, \"percentile_inc_nulls\": 0.4592827558517456, \"value_count\": 61, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 976.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.45390748977661133, \"percentile_inc_nulls\": 0.45537513494491577, \"value_count\": 60, \"group_name\": 
\"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 900.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.45185261964797974, \"percentile_inc_nulls\": 0.4533258080482483, \"value_count\": 59, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 472.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.44831758737564087, \"percentile_inc_nulls\": 0.4498002529144287, \"value_count\": 58, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 812.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.4448435306549072, \"percentile_inc_nulls\": 0.4463355541229248, \"value_count\": 57, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 798.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.4411866068840027, \"percentile_inc_nulls\": 0.4426884055137634, \"value_count\": 56, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 840.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.43807387351989746, \"percentile_inc_nulls\": 0.43958407640457153, \"value_count\": 55, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 715.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.4350177049636841, \"percentile_inc_nulls\": 0.43653613328933716, \"value_count\": 54, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 702.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.43086445331573486, \"percentile_inc_nulls\": 0.4323940873146057, 
\"value_count\": 53, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 954.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.4249785542488098, \"percentile_inc_nulls\": 0.42652398347854614, \"value_count\": 52, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1352.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.4220921993255615, \"percentile_inc_nulls\": 0.42364537715911865, \"value_count\": 51, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 663.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.4181740880012512, \"percentile_inc_nulls\": 0.4197377562522888, \"value_count\": 50, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 900.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.4136943221092224, \"percentile_inc_nulls\": 0.41527003049850464, \"value_count\": 49, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1029.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.41055983304977417, \"percentile_inc_nulls\": 0.4121439456939697, \"value_count\": 48, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 720.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.40605831146240234, \"percentile_inc_nulls\": 0.4076545834541321, \"value_count\": 47, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1034.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.4018528461456299, 
\"percentile_inc_nulls\": 0.40346038341522217, \"value_count\": 46, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 966.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.39773881435394287, \"percentile_inc_nulls\": 0.3993574380874634, \"value_count\": 45, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 945.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.39390772581100464, \"percentile_inc_nulls\": 0.3955366611480713, \"value_count\": 44, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 880.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.3896021246910095, \"percentile_inc_nulls\": 0.3912426233291626, \"value_count\": 43, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 989.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.3866766095161438, \"percentile_inc_nulls\": 0.38832491636276245, \"value_count\": 42, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 672.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.38096481561660767, \"percentile_inc_nulls\": 0.38262850046157837, \"value_count\": 41, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1312.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.3760889172554016, \"percentile_inc_nulls\": 0.3777657151222229, \"value_count\": 40, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1120.0, \"distinct_value_count\": 11612}, 
{\"percentile_ex_nulls\": 0.3715047240257263, \"percentile_inc_nulls\": 0.37319380044937134, \"value_count\": 39, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1053.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.36488741636276245, \"percentile_inc_nulls\": 0.3665943145751953, \"value_count\": 38, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1520.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.3584442138671875, \"percentile_inc_nulls\": 0.36016845703125, \"value_count\": 37, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1480.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.35389918088912964, \"percentile_inc_nulls\": 0.3556356430053711, \"value_count\": 36, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1044.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.35039466619491577, \"percentile_inc_nulls\": 0.35214048624038696, \"value_count\": 35, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 805.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.34521400928497314, \"percentile_inc_nulls\": 0.3469737768173218, \"value_count\": 34, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1190.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.3393237590789795, \"percentile_inc_nulls\": 0.3410993218421936, \"value_count\": 33, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 
1353.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.33291542530059814, \"percentile_inc_nulls\": 0.33470821380615234, \"value_count\": 32, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1472.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.3269772529602051, \"percentile_inc_nulls\": 0.3287860155105591, \"value_count\": 31, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1364.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.3180961608886719, \"percentile_inc_nulls\": 0.31992876529693604, \"value_count\": 30, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2040.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.30850106477737427, \"percentile_inc_nulls\": 0.31035947799682617, \"value_count\": 29, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2204.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.2969207763671875, \"percentile_inc_nulls\": 0.29881036281585693, \"value_count\": 28, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2660.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.2848137617111206, \"percentile_inc_nulls\": 0.2867358326911926, \"value_count\": 27, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2781.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.276550829410553, \"percentile_inc_nulls\": 0.2784951329231262, \"value_count\": 26, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 
230320, \"sum_tokens_in_value_count_group\": 1898.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.2687145471572876, \"percentile_inc_nulls\": 0.27067995071411133, \"value_count\": 25, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1800.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.25753480195999146, \"percentile_inc_nulls\": 0.259530246257782, \"value_count\": 24, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2568.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.24732154607772827, \"percentile_inc_nulls\": 0.24934440851211548, \"value_count\": 23, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2346.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.23803120851516724, \"percentile_inc_nulls\": 0.24007904529571533, \"value_count\": 22, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2134.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.22779178619384766, \"percentile_inc_nulls\": 0.22986716032028198, \"value_count\": 21, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2352.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.21899771690368652, \"percentile_inc_nulls\": 0.22109675407409668, \"value_count\": 20, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2020.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.20865821838378906, \"percentile_inc_nulls\": 0.21078497171401978, \"value_count\": 19, \"group_name\": \"_zip_code_\", 
\"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2375.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.19792252779006958, \"percentile_inc_nulls\": 0.20007812976837158, \"value_count\": 18, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2466.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.18689513206481934, \"percentile_inc_nulls\": 0.18908041715621948, \"value_count\": 17, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2533.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.1774219274520874, \"percentile_inc_nulls\": 0.17963266372680664, \"value_count\": 16, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2176.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.16514509916305542, \"percentile_inc_nulls\": 0.16738885641098022, \"value_count\": 15, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2820.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.15679514408111572, \"percentile_inc_nulls\": 0.15906131267547607, \"value_count\": 14, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1918.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.1475135087966919, \"percentile_inc_nulls\": 0.14980459213256836, \"value_count\": 13, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2132.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.13732635974884033, \"percentile_inc_nulls\": 0.13964486122131348, 
\"value_count\": 12, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2340.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.12607258558273315, \"percentile_inc_nulls\": 0.12842130661010742, \"value_count\": 11, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2585.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.11405694484710693, \"percentile_inc_nulls\": 0.11643797159194946, \"value_count\": 10, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2760.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.10151892900466919, \"percentile_inc_nulls\": 0.10393363237380981, \"value_count\": 9, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2880.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.08891123533248901, \"percentile_inc_nulls\": 0.09135985374450684, \"value_count\": 8, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2896.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.07592916488647461, \"percentile_inc_nulls\": 0.07841265201568604, \"value_count\": 7, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2982.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.06292092800140381, \"percentile_inc_nulls\": 0.06543940305709839, \"value_count\": 6, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2988.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 
0.049229204654693604, \"percentile_inc_nulls\": 0.0517844557762146, \"value_count\": 5, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3145.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.034880101680755615, \"percentile_inc_nulls\": 0.037473976612091064, \"value_count\": 4, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3296.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.02078789472579956, \"percentile_inc_nulls\": 0.023419618606567383, \"value_count\": 3, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3237.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.008188903331756592, \"percentile_inc_nulls\": 0.010854482650756836, \"value_count\": 2, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2894.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0026875734329223633, \"value_count\": 1, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1881.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 2277, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2277.0, \"distinct_value_count\": 11612}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": 
\"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"zip_code\\\"\", \"subtitle\": \"In this col, 619 values (0.3%) are null and there are 11612 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 2277, \"group_name\": \"_zip_code_\", \"value\": \"10022\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 11612}, {\"value_count\": 1939, \"group_name\": \"_zip_code_\", \"value\": \"10036\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 11612}, {\"value_count\": 1821, \"group_name\": \"_zip_code_\", \"value\": \"10019\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 11612}, {\"value_count\": 1603, \"group_name\": \"_zip_code_\", \"value\": \"00000\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 11612}, {\"value_count\": 1434, \"group_name\": \"_zip_code_\", \"value\": \"02110\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 11612}, {\"value_count\": 1374, \"group_name\": \"_zip_code_\", \"value\": \"10017\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 11612}, {\"value_count\": 1369, \"group_name\": \"_zip_code_\", \"value\": \"77002\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 11612}, {\"value_count\": 1339, \"group_name\": \"_zip_code_\", \"value\": \"10013\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 11612}, {\"value_count\": 1124, \"group_name\": \"_zip_code_\", 
\"value\": \"92121\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 11612}, {\"value_count\": 1026, \"group_name\": \"_zip_code_\", \"value\": \"91302\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 11612}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"55446-0106\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 11612}, {\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"75140\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 11612}, {\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"79550\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 11612}, {\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"06410\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 11612}, {\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"98327\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 11612}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], 
\"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 2277]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}], \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\"}, {\"mode\": \"vega-lite\"});\n",
+       "</script>"
+      ],
+      "text/plain": [
+       "alt.VConcatChart(...)"
+      ]
+     },
+     "execution_count": 210,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "profile_columns(sec_match_df[match_cols], db_api=DuckDBAPI(), top_n=10, bottom_n=5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 211,
+   "id": "d9cbc91b-b061-461b-b1f2-83036c70d7c7",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "<style>\n",
+       "  #altair-viz-6022312cfc10441087e40cd1a0af0abb.vega-embed {\n",
+       "    width: 100%;\n",
+       "    display: flex;\n",
+       "  }\n",
+       "\n",
+       "  #altair-viz-6022312cfc10441087e40cd1a0af0abb.vega-embed details,\n",
+       "  #altair-viz-6022312cfc10441087e40cd1a0af0abb.vega-embed details summary {\n",
+       "    position: relative;\n",
+       "  }\n",
+       "</style>\n",
+       "<div id=\"altair-viz-6022312cfc10441087e40cd1a0af0abb\"></div>\n",
+       "<script type=\"text/javascript\">\n",
+       "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
+       "  (function(spec, embedOpt){\n",
+       "    let outputDiv = document.currentScript.previousElementSibling;\n",
+       "    if (outputDiv.id !== \"altair-viz-6022312cfc10441087e40cd1a0af0abb\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-6022312cfc10441087e40cd1a0af0abb\");\n",
+       "    }\n",
+       "    const paths = {\n",
+       "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
+       "      \"vega-lib\": \"https://cdn.jsdelivr.net/npm/vega-lib?noext\",\n",
+       "      \"vega-lite\": \"https://cdn.jsdelivr.net/npm/vega-lite@5.20.1?noext\",\n",
+       "      \"vega-embed\": \"https://cdn.jsdelivr.net/npm/vega-embed@6?noext\",\n",
+       "    };\n",
+       "\n",
+       "    function maybeLoadScript(lib, version) {\n",
+       "      var key = `${lib.replace(\"-\", \"\")}_version`;\n",
+       "      return (VEGA_DEBUG[key] == version) ?\n",
+       "        Promise.resolve(paths[lib]) :\n",
+       "        new Promise(function(resolve, reject) {\n",
+       "          var s = document.createElement('script');\n",
+       "          document.getElementsByTagName(\"head\")[0].appendChild(s);\n",
+       "          s.async = true;\n",
+       "          s.onload = () => {\n",
+       "            VEGA_DEBUG[key] = version;\n",
+       "            return resolve(paths[lib]);\n",
+       "          };\n",
+       "          s.onerror = () => reject(`Error loading script: ${paths[lib]}`);\n",
+       "          s.src = paths[lib];\n",
+       "        });\n",
+       "    }\n",
+       "\n",
+       "    function showError(err) {\n",
+       "      outputDiv.innerHTML = `<div class=\"error\" style=\"color:red;\">${err}</div>`;\n",
+       "      throw err;\n",
+       "    }\n",
+       "\n",
+       "    function displayChart(vegaEmbed) {\n",
+       "      vegaEmbed(outputDiv, spec, embedOpt)\n",
+       "        .catch(err => showError(`Javascript Error: ${err.message}<br>This usually means there's a typo in your chart specification. See the javascript console for the full traceback.`));\n",
+       "    }\n",
+       "\n",
+       "    if(typeof define === \"function\" && define.amd) {\n",
+       "      requirejs.config({paths});\n",
+       "      require([\"vega-embed\"], displayChart, err => showError(`Error loading script: ${err.message}`));\n",
+       "    } else {\n",
+       "      maybeLoadScript(\"vega\", \"5\")\n",
+       "        .then(() => maybeLoadScript(\"vega-lite\", \"5.20.1\"))\n",
+       "        .then(() => maybeLoadScript(\"vega-embed\", \"6\"))\n",
+       "        .catch(showError)\n",
+       "        .then(() => displayChart(vegaEmbed));\n",
+       "    }\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"vconcat\": [{\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9992628693580627, \"percentile_inc_nulls\": 0.9992628693580627, \"value_count\": 130, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 130.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9986108541488647, \"percentile_inc_nulls\": 0.9986108541488647, \"value_count\": 115, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 115.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9979758262634277, \"percentile_inc_nulls\": 0.9979758262634277, \"value_count\": 112, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.997369110584259, \"percentile_inc_nulls\": 0.997369110584259, \"value_count\": 107, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 107.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9968814849853516, \"percentile_inc_nulls\": 0.9968814849853516, \"value_count\": 86, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 86.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9963995218276978, \"percentile_inc_nulls\": 0.9963995218276978, \"value_count\": 85, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 85.0, \"distinct_value_count\": 17785}, 
{\"percentile_ex_nulls\": 0.9959743022918701, \"percentile_inc_nulls\": 0.9959743022918701, \"value_count\": 75, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 75.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9955546855926514, \"percentile_inc_nulls\": 0.9955546855926514, \"value_count\": 74, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 74.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9951464533805847, \"percentile_inc_nulls\": 0.9951464533805847, \"value_count\": 72, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 72.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.994755208492279, \"percentile_inc_nulls\": 0.994755208492279, \"value_count\": 69, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 69.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9943753480911255, \"percentile_inc_nulls\": 0.9943753480911255, \"value_count\": 67, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 67.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9940010905265808, \"percentile_inc_nulls\": 0.9940010905265808, \"value_count\": 66, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 66.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9932640194892883, \"percentile_inc_nulls\": 0.9932640194892883, \"value_count\": 65, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, 
\"sum_tokens_in_value_count_group\": 130.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9929068088531494, \"percentile_inc_nulls\": 0.9929068088531494, \"value_count\": 63, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 63.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9922150373458862, \"percentile_inc_nulls\": 0.9922150373458862, \"value_count\": 61, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 122.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9918975234031677, \"percentile_inc_nulls\": 0.9918975234031677, \"value_count\": 56, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9909619688987732, \"percentile_inc_nulls\": 0.9909619688987732, \"value_count\": 55, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 165.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.990655779838562, \"percentile_inc_nulls\": 0.990655779838562, \"value_count\": 54, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 54.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9900547862052917, \"percentile_inc_nulls\": 0.9900547862052917, \"value_count\": 53, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 106.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9897599220275879, \"percentile_inc_nulls\": 0.9897599220275879, \"value_count\": 52, \"group_name\": \"_company_name_\", 
\"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 52.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9894764423370361, \"percentile_inc_nulls\": 0.9894764423370361, \"value_count\": 50, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 50.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.989198625087738, \"percentile_inc_nulls\": 0.989198625087738, \"value_count\": 49, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 49.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9856604933738708, \"percentile_inc_nulls\": 0.9856604933738708, \"value_count\": 48, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 624.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9837950468063354, \"percentile_inc_nulls\": 0.9837950468063354, \"value_count\": 47, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 329.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9827517867088318, \"percentile_inc_nulls\": 0.9827517867088318, \"value_count\": 46, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 184.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9817311763763428, \"percentile_inc_nulls\": 0.9817311763763428, \"value_count\": 45, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 180.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.981481671333313, \"percentile_inc_nulls\": 0.981481671333313, 
\"value_count\": 44, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 44.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9809940457344055, \"percentile_inc_nulls\": 0.9809940457344055, \"value_count\": 43, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 86.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9805178046226501, \"percentile_inc_nulls\": 0.9805178046226501, \"value_count\": 42, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 84.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9795879125595093, \"percentile_inc_nulls\": 0.9795879125595093, \"value_count\": 41, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 164.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9791343212127686, \"percentile_inc_nulls\": 0.9791343212127686, \"value_count\": 40, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 80.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9786920547485352, \"percentile_inc_nulls\": 0.9786920547485352, \"value_count\": 39, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9761065244674683, \"percentile_inc_nulls\": 0.9761065244674683, \"value_count\": 38, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 456.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 
0.9750575423240662, \"percentile_inc_nulls\": 0.9750575423240662, \"value_count\": 37, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 185.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9746493101119995, \"percentile_inc_nulls\": 0.9746493101119995, \"value_count\": 36, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 72.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9728632569313049, \"percentile_inc_nulls\": 0.9728632569313049, \"value_count\": 35, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 315.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9717065691947937, \"percentile_inc_nulls\": 0.9717065691947937, \"value_count\": 34, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 204.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9700225591659546, \"percentile_inc_nulls\": 0.9700225591659546, \"value_count\": 33, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 297.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9682081341743469, \"percentile_inc_nulls\": 0.9682081341743469, \"value_count\": 32, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 320.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9597711563110352, \"percentile_inc_nulls\": 0.9597711563110352, \"value_count\": 31, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1488.0, 
\"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9567093253135681, \"percentile_inc_nulls\": 0.9567093253135681, \"value_count\": 30, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 540.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9550650119781494, \"percentile_inc_nulls\": 0.9550650119781494, \"value_count\": 29, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 290.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9520485997200012, \"percentile_inc_nulls\": 0.9520485997200012, \"value_count\": 28, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 532.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9500584006309509, \"percentile_inc_nulls\": 0.9500584006309509, \"value_count\": 27, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 351.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.948879063129425, \"percentile_inc_nulls\": 0.948879063129425, \"value_count\": 26, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 208.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9470362663269043, \"percentile_inc_nulls\": 0.9470362663269043, \"value_count\": 25, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 325.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.7761189937591553, \"percentile_inc_nulls\": 0.7761189937591553, \"value_count\": 24, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, 
\"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 30144.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.7446900606155396, \"percentile_inc_nulls\": 0.7446900606155396, \"value_count\": 23, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 5543.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.692049503326416, \"percentile_inc_nulls\": 0.692049503326416, \"value_count\": 22, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 9284.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.6740698218345642, \"percentile_inc_nulls\": 0.6740698218345642, \"value_count\": 21, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 3171.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.6567195653915405, \"percentile_inc_nulls\": 0.6567195653915405, \"value_count\": 20, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 3060.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.6309719681739807, \"percentile_inc_nulls\": 0.6309719681739807, \"value_count\": 19, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 4541.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.6099475026130676, \"percentile_inc_nulls\": 0.6099475026130676, \"value_count\": 18, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 3708.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.5923080444335938, \"percentile_inc_nulls\": 0.5923080444335938, \"value_count\": 17, 
\"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 3111.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.5730752944946289, \"percentile_inc_nulls\": 0.5730752944946289, \"value_count\": 16, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 3392.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.5208543539047241, \"percentile_inc_nulls\": 0.5208543539047241, \"value_count\": 15, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 9210.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.4969608783721924, \"percentile_inc_nulls\": 0.4969608783721924, \"value_count\": 14, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 4214.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.4662973880767822, \"percentile_inc_nulls\": 0.4662973880767822, \"value_count\": 13, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 5408.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.41703617572784424, \"percentile_inc_nulls\": 0.41703617572784424, \"value_count\": 12, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 8688.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.37624597549438477, \"percentile_inc_nulls\": 0.37624597549438477, \"value_count\": 11, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 7194.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 
0.3343445062637329, \"percentile_inc_nulls\": 0.3343445062637329, \"value_count\": 10, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 7390.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.2747921943664551, \"percentile_inc_nulls\": 0.2747921943664551, \"value_count\": 9, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 10503.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.23614531755447388, \"percentile_inc_nulls\": 0.23614531755447388, \"value_count\": 8, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 6816.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.1329507827758789, \"percentile_inc_nulls\": 0.1329507827758789, \"value_count\": 7, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 18200.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.10240066051483154, \"percentile_inc_nulls\": 0.10240066051483154, \"value_count\": 6, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 5388.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.060612618923187256, \"percentile_inc_nulls\": 0.060612618923187256, \"value_count\": 5, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 7370.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.03430366516113281, \"percentile_inc_nulls\": 0.03430366516113281, \"value_count\": 4, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, 
\"sum_tokens_in_value_count_group\": 4640.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.015694618225097656, \"percentile_inc_nulls\": 0.015694618225097656, \"value_count\": 3, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 3282.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.004479348659515381, \"percentile_inc_nulls\": 0.004479348659515381, \"value_count\": 2, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1978.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 1, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 790.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 130, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 130.0, \"distinct_value_count\": 17785}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"company_name\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 17785 distinct values\"}}, {\"mark\": \"bar\", \"data\": 
{\"values\": [{\"value_count\": 130, \"group_name\": \"_company_name_\", \"value\": \"georgia pacific corporation\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 17785}, {\"value_count\": 115, \"group_name\": \"_company_name_\", \"value\": \"weyerhaeuser company\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 17785}, {\"value_count\": 112, \"group_name\": \"_company_name_\", \"value\": \"international paper company\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 17785}, {\"value_count\": 107, \"group_name\": \"_company_name_\", \"value\": \"calpine corporation\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 17785}, {\"value_count\": 86, \"group_name\": \"_company_name_\", \"value\": \"springfield city of\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 17785}, {\"value_count\": 85, \"group_name\": \"_company_name_\", \"value\": \"calpine eastern corporation\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 17785}, {\"value_count\": 75, \"group_name\": \"_company_name_\", \"value\": \"tri county electric coop, incorporated\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 17785}, {\"value_count\": 74, \"group_name\": \"_company_name_\", \"value\": \"stone container corporation\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 17785}, {\"value_count\": 72, \"group_name\": \"_company_name_\", \"value\": \"marshall city of\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 17785}, {\"value_count\": 69, \"group_name\": \"_company_name_\", \"value\": \"burlington city of\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, 
\"distinct_value_count\": 17785}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"reliability design and development limited liability company\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 17785}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"anole energy storage, limited liability company\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 17785}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"northumberland solar i, limited liability company\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 17785}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"ny cdg genesee 4 limited liability company\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 17785}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"eni new energy us, incorporated\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 17785}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", 
\"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 130]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9043566584587097, \"percentile_inc_nulls\": 0.9151197075843811, \"value_count\": 14970, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 14970.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.821810781955719, \"percentile_inc_nulls\": 0.8418629169464111, \"value_count\": 12920, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 12920.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.7686542868614197, \"percentile_inc_nulls\": 0.79468834400177, \"value_count\": 8320, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 8320.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.7240015268325806, \"percentile_inc_nulls\": 0.7550604939460754, \"value_count\": 6989, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 6989.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.6845366954803467, \"percentile_inc_nulls\": 0.7200367450714111, \"value_count\": 6177, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 6177.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.6467840671539307, \"percentile_inc_nulls\": 0.6865325570106506, \"value_count\": 5909, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, 
\"sum_tokens_in_value_count_group\": 5909.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.6090570688247681, \"percentile_inc_nulls\": 0.6530510187149048, \"value_count\": 5905, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 5905.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.5756553411483765, \"percentile_inc_nulls\": 0.6234081387519836, \"value_count\": 5228, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 5228.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.5448348522186279, \"percentile_inc_nulls\": 0.5960559248924255, \"value_count\": 4824, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 4824.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.5146595239639282, \"percentile_inc_nulls\": 0.5692763924598694, \"value_count\": 4723, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 4723.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.4875510334968567, \"percentile_inc_nulls\": 0.5452184677124023, \"value_count\": 4243, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 4243.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.46147751808166504, \"percentile_inc_nulls\": 0.5220791101455688, \"value_count\": 4081, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 4081.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.4364837408065796, \"percentile_inc_nulls\": 0.49989795684814453, \"value_count\": 3912, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, 
\"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 3912.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.4123716354370117, \"percentile_inc_nulls\": 0.47849923372268677, \"value_count\": 3774, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 3774.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.3896331787109375, \"percentile_inc_nulls\": 0.45831960439682007, \"value_count\": 3559, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 3559.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.3688114285469055, \"percentile_inc_nulls\": 0.43984103202819824, \"value_count\": 3259, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 3259.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.34828996658325195, \"percentile_inc_nulls\": 0.4216288924217224, \"value_count\": 3212, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 3212.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.328139066696167, \"percentile_inc_nulls\": 0.4037455916404724, \"value_count\": 3154, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 3154.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.3089912533760071, \"percentile_inc_nulls\": 0.38675254583358765, \"value_count\": 2997, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2997.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.2900094985961914, \"percentile_inc_nulls\": 0.3699069023132324, \"value_count\": 2971, \"group_name\": \"_state_\", 
\"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2971.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.2711939215660095, \"percentile_inc_nulls\": 0.35320866107940674, \"value_count\": 2945, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2945.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.25474226474761963, \"percentile_inc_nulls\": 0.3386083245277405, \"value_count\": 2575, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2575.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.2392297387123108, \"percentile_inc_nulls\": 0.3248414993286133, \"value_count\": 2428, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2428.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.22381949424743652, \"percentile_inc_nulls\": 0.31116539239883423, \"value_count\": 2412, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2412.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.20867115259170532, \"percentile_inc_nulls\": 0.297721803188324, \"value_count\": 2371, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2371.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.19549065828323364, \"percentile_inc_nulls\": 0.2860245108604431, \"value_count\": 2063, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2063.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.18243151903152466, \"percentile_inc_nulls\": 0.27443498373031616, \"value_count\": 2044, 
\"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2044.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.16952574253082275, \"percentile_inc_nulls\": 0.2629815340042114, \"value_count\": 2020, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2020.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.15754634141921997, \"percentile_inc_nulls\": 0.25235021114349365, \"value_count\": 1875, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1875.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.14563089609146118, \"percentile_inc_nulls\": 0.24177563190460205, \"value_count\": 1865, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1865.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.13393902778625488, \"percentile_inc_nulls\": 0.23139947652816772, \"value_count\": 1830, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1830.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.12323743104934692, \"percentile_inc_nulls\": 0.22190219163894653, \"value_count\": 1675, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1675.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.11320030689239502, \"percentile_inc_nulls\": 0.21299457550048828, \"value_count\": 1571, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1571.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.10325264930725098, \"percentile_inc_nulls\": 0.20416635274887085, 
\"value_count\": 1557, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1557.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.09345829486846924, \"percentile_inc_nulls\": 0.19547420740127563, \"value_count\": 1533, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1533.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.08405369520187378, \"percentile_inc_nulls\": 0.18712788820266724, \"value_count\": 1472, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1472.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.07524967193603516, \"percentile_inc_nulls\": 0.17931461334228516, \"value_count\": 1378, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1378.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.06659895181655884, \"percentile_inc_nulls\": 0.1716374158859253, \"value_count\": 1354, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1354.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.059660494327545166, \"percentile_inc_nulls\": 0.16547971963882446, \"value_count\": 1086, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1086.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.05314368009567261, \"percentile_inc_nulls\": 0.15969634056091309, \"value_count\": 1020, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1020.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.046812236309051514, 
\"percentile_inc_nulls\": 0.15407729148864746, \"value_count\": 991, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 991.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.040736258029937744, \"percentile_inc_nulls\": 0.14868509769439697, \"value_count\": 951, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 951.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.035171449184417725, \"percentile_inc_nulls\": 0.14374655485153198, \"value_count\": 871, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 871.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.0299643874168396, \"percentile_inc_nulls\": 0.13912546634674072, \"value_count\": 815, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 815.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.025000154972076416, \"percentile_inc_nulls\": 0.1347198486328125, \"value_count\": 777, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 777.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.02038729190826416, \"percentile_inc_nulls\": 0.13062608242034912, \"value_count\": 722, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 722.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.016336679458618164, \"percentile_inc_nulls\": 0.12703126668930054, \"value_count\": 634, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 634.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 
0.01256716251373291, \"percentile_inc_nulls\": 0.12368595600128174, \"value_count\": 590, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 590.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.009583473205566406, \"percentile_inc_nulls\": 0.12103807926177979, \"value_count\": 467, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 467.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.00688093900680542, \"percentile_inc_nulls\": 0.11863964796066284, \"value_count\": 423, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 423.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.004612863063812256, \"percentile_inc_nulls\": 0.1166267991065979, \"value_count\": 355, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 355.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.002830326557159424, \"percentile_inc_nulls\": 0.11504483222961426, \"value_count\": 279, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 279.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.0016611218452453613, \"percentile_inc_nulls\": 0.11400723457336426, \"value_count\": 183, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 183.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.0009072422981262207, \"percentile_inc_nulls\": 0.1133381724357605, \"value_count\": 118, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 118.0, \"distinct_value_count\": 63}, 
{\"percentile_ex_nulls\": 0.0005686283111572266, \"percentile_inc_nulls\": 0.11303764581680298, \"value_count\": 53, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 53.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.0004088878631591797, \"percentile_inc_nulls\": 0.1128959059715271, \"value_count\": 25, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 25.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.0002875328063964844, \"percentile_inc_nulls\": 0.11278820037841797, \"value_count\": 19, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 19.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 7.665157318115234e-05, \"percentile_inc_nulls\": 0.11260104179382324, \"value_count\": 11, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 3.1948089599609375e-05, \"percentile_inc_nulls\": 0.11256140470504761, \"value_count\": 7, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 7.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 6.377696990966797e-06, \"percentile_inc_nulls\": 0.11253869533538818, \"value_count\": 4, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 4.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.11253303289413452, \"value_count\": 1, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1.0, \"distinct_value_count\": 63}, 
{\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 14970, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 14970.0, \"distinct_value_count\": 63}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"state\\\"\", \"subtitle\": \"In this col, 19,847 values (11.3%) are null and there are 63 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 14970, \"group_name\": \"_state_\", \"value\": \"ca\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 63}, {\"value_count\": 12920, \"group_name\": \"_state_\", \"value\": \"tx\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 63}, {\"value_count\": 8320, \"group_name\": \"_state_\", \"value\": \"ny\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 63}, {\"value_count\": 6989, \"group_name\": \"_state_\", \"value\": \"nc\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 63}, {\"value_count\": 6177, \"group_name\": \"_state_\", \"value\": \"ma\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 63}, {\"value_count\": 5909, \"group_name\": \"_state_\", \"value\": \"fl\", 
\"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 63}, {\"value_count\": 5905, \"group_name\": \"_state_\", \"value\": \"mn\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 63}, {\"value_count\": 5228, \"group_name\": \"_state_\", \"value\": \"pa\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 63}, {\"value_count\": 4824, \"group_name\": \"_state_\", \"value\": \"il\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 63}, {\"value_count\": 4723, \"group_name\": \"_state_\", \"value\": \"ia\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 63}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"mp\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 63}, {\"value_count\": 4, \"group_name\": \"_state_\", \"value\": \"uk\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 63}, {\"value_count\": 7, \"group_name\": \"_state_\", \"value\": \"8a\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 63}, {\"value_count\": 11, \"group_name\": \"_state_\", \"value\": \"pr\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 63}, 
{\"value_count\": 11, \"group_name\": \"_state_\", \"value\": \"gu\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 63}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 14970]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9656325578689575, \"percentile_inc_nulls\": 0.9748250842094421, \"value_count\": 4440, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 4440.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.9436729550361633, \"percentile_inc_nulls\": 0.9587392210960388, \"value_count\": 2837, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2837.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.9245464205741882, \"percentile_inc_nulls\": 0.9447285532951355, \"value_count\": 2471, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2471.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.908012866973877, \"percentile_inc_nulls\": 0.9326174259185791, \"value_count\": 2136, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2136.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 
0.8943200707435608, \"percentile_inc_nulls\": 0.9225870966911316, \"value_count\": 1769, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1769.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.881432294845581, \"percentile_inc_nulls\": 0.9131464958190918, \"value_count\": 1665, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1665.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.8725308179855347, \"percentile_inc_nulls\": 0.9066259860992432, \"value_count\": 1150, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1150.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.8643027544021606, \"percentile_inc_nulls\": 0.9005987644195557, \"value_count\": 1063, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1063.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.8566164970397949, \"percentile_inc_nulls\": 0.8949683904647827, \"value_count\": 993, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 993.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.8491934537887573, \"percentile_inc_nulls\": 0.8895308375358582, \"value_count\": 959, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 959.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.841948390007019, \"percentile_inc_nulls\": 0.8842236995697021, \"value_count\": 936, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 936.0, \"distinct_value_count\": 4230}, 
{\"percentile_ex_nulls\": 0.8355780839920044, \"percentile_inc_nulls\": 0.8795572519302368, \"value_count\": 823, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 823.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.8303300142288208, \"percentile_inc_nulls\": 0.8757129907608032, \"value_count\": 678, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 678.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.8251284956932068, \"percentile_inc_nulls\": 0.8719027638435364, \"value_count\": 672, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 672.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.8199656009674072, \"percentile_inc_nulls\": 0.8681208491325378, \"value_count\": 667, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 667.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.814818263053894, \"percentile_inc_nulls\": 0.8643502593040466, \"value_count\": 665, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 665.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.8097947239875793, \"percentile_inc_nulls\": 0.8606704473495483, \"value_count\": 649, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 649.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.8050498366355896, \"percentile_inc_nulls\": 0.8571946620941162, \"value_count\": 613, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 613.0, \"distinct_value_count\": 
4230}, {\"percentile_ex_nulls\": 0.8005449175834656, \"percentile_inc_nulls\": 0.8538947105407715, \"value_count\": 582, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 582.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7961406111717224, \"percentile_inc_nulls\": 0.8506684899330139, \"value_count\": 569, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 569.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7919143438339233, \"percentile_inc_nulls\": 0.8475726842880249, \"value_count\": 546, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 546.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7879435420036316, \"percentile_inc_nulls\": 0.8446639180183411, \"value_count\": 513, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 513.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7840656042098999, \"percentile_inc_nulls\": 0.8418232202529907, \"value_count\": 501, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 501.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7802186012268066, \"percentile_inc_nulls\": 0.8390052318572998, \"value_count\": 497, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 497.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7763948440551758, \"percentile_inc_nulls\": 0.8362042903900146, \"value_count\": 494, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 494.0, 
\"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7726020216941833, \"percentile_inc_nulls\": 0.8334259390830994, \"value_count\": 490, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 490.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7688246965408325, \"percentile_inc_nulls\": 0.8306589722633362, \"value_count\": 488, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 488.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7613784074783325, \"percentile_inc_nulls\": 0.8252043724060059, \"value_count\": 481, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 962.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7577094435691833, \"percentile_inc_nulls\": 0.8225167989730835, \"value_count\": 474, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 474.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7504025101661682, \"percentile_inc_nulls\": 0.8171643018722534, \"value_count\": 472, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 944.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7467954754829407, \"percentile_inc_nulls\": 0.8145220875740051, \"value_count\": 466, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 466.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7432968020439148, \"percentile_inc_nulls\": 0.8119592070579529, \"value_count\": 452, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 
452.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7398368120193481, \"percentile_inc_nulls\": 0.8094246983528137, \"value_count\": 447, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 447.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7364155650138855, \"percentile_inc_nulls\": 0.8069185614585876, \"value_count\": 442, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 442.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7331104278564453, \"percentile_inc_nulls\": 0.804497480392456, \"value_count\": 427, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 427.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7299058437347412, \"percentile_inc_nulls\": 0.8021500706672668, \"value_count\": 414, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 414.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7267787456512451, \"percentile_inc_nulls\": 0.7998594045639038, \"value_count\": 404, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 404.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7236748337745667, \"percentile_inc_nulls\": 0.7975857257843018, \"value_count\": 401, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 401.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7206483483314514, \"percentile_inc_nulls\": 0.7953687310218811, \"value_count\": 391, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, 
\"sum_tokens_in_value_count_group\": 391.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7176682949066162, \"percentile_inc_nulls\": 0.7931857705116272, \"value_count\": 385, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 385.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7148430347442627, \"percentile_inc_nulls\": 0.7911162376403809, \"value_count\": 365, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 365.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7121338844299316, \"percentile_inc_nulls\": 0.7891317009925842, \"value_count\": 350, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 350.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7094789147377014, \"percentile_inc_nulls\": 0.787186861038208, \"value_count\": 343, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 343.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.706924557685852, \"percentile_inc_nulls\": 0.785315752029419, \"value_count\": 330, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 330.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7043780088424683, \"percentile_inc_nulls\": 0.7834503054618835, \"value_count\": 329, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 329.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7020094394683838, \"percentile_inc_nulls\": 0.7817152738571167, \"value_count\": 306, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, 
\"sum_tokens_in_value_count_group\": 306.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6997259855270386, \"percentile_inc_nulls\": 0.7800426483154297, \"value_count\": 295, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 295.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6974812746047974, \"percentile_inc_nulls\": 0.778398334980011, \"value_count\": 290, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 290.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6953061819076538, \"percentile_inc_nulls\": 0.77680504322052, \"value_count\": 281, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 281.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6931775808334351, \"percentile_inc_nulls\": 0.7752457857131958, \"value_count\": 275, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 275.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.688951313495636, \"percentile_inc_nulls\": 0.7721499800682068, \"value_count\": 273, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 546.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6847715377807617, \"percentile_inc_nulls\": 0.7690881490707397, \"value_count\": 270, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 540.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6827589869499207, \"percentile_inc_nulls\": 0.76761394739151, \"value_count\": 260, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, 
\"sum_tokens_in_value_count_group\": 260.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6807929277420044, \"percentile_inc_nulls\": 0.7661737203598022, \"value_count\": 254, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 254.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6788965463638306, \"percentile_inc_nulls\": 0.764784574508667, \"value_count\": 245, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 245.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6770620346069336, \"percentile_inc_nulls\": 0.7634407877922058, \"value_count\": 237, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 237.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.675250768661499, \"percentile_inc_nulls\": 0.7621140480041504, \"value_count\": 234, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.673454999923706, \"percentile_inc_nulls\": 0.7607985734939575, \"value_count\": 232, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 232.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6717057228088379, \"percentile_inc_nulls\": 0.7595171332359314, \"value_count\": 226, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 226.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.670002818107605, \"percentile_inc_nulls\": 0.758269727230072, \"value_count\": 220, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, 
\"sum_tokens_in_value_count_group\": 220.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6683308482170105, \"percentile_inc_nulls\": 0.7570450305938721, \"value_count\": 216, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 216.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6666666269302368, \"percentile_inc_nulls\": 0.755825936794281, \"value_count\": 215, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 215.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6650102138519287, \"percentile_inc_nulls\": 0.7546125650405884, \"value_count\": 214, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 214.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6584153771400452, \"percentile_inc_nulls\": 0.7497817277908325, \"value_count\": 213, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 852.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6567744016647339, \"percentile_inc_nulls\": 0.7485796213150024, \"value_count\": 212, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 212.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6551489233970642, \"percentile_inc_nulls\": 0.7473889589309692, \"value_count\": 210, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 210.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6535311937332153, \"percentile_inc_nulls\": 0.7462038993835449, \"value_count\": 209, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 
176366, \"sum_tokens_in_value_count_group\": 209.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6519289016723633, \"percentile_inc_nulls\": 0.7450302243232727, \"value_count\": 207, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 207.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.650334358215332, \"percentile_inc_nulls\": 0.7438621520996094, \"value_count\": 206, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 206.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6487475633621216, \"percentile_inc_nulls\": 0.7426998615264893, \"value_count\": 205, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 205.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6471762657165527, \"percentile_inc_nulls\": 0.7415488362312317, \"value_count\": 203, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 203.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6456127166748047, \"percentile_inc_nulls\": 0.7404034733772278, \"value_count\": 202, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 202.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6440646648406982, \"percentile_inc_nulls\": 0.739269495010376, \"value_count\": 200, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6425474882125854, \"percentile_inc_nulls\": 0.738158106803894, \"value_count\": 196, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 
176366, \"sum_tokens_in_value_count_group\": 196.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.641038179397583, \"percentile_inc_nulls\": 0.7370525002479553, \"value_count\": 195, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 195.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6396061778068542, \"percentile_inc_nulls\": 0.7360035181045532, \"value_count\": 185, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 185.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6381819248199463, \"percentile_inc_nulls\": 0.7349602580070496, \"value_count\": 184, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 184.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6353799104690552, \"percentile_inc_nulls\": 0.7329077124595642, \"value_count\": 181, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 362.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6339865922927856, \"percentile_inc_nulls\": 0.7318871021270752, \"value_count\": 180, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 180.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6326165795326233, \"percentile_inc_nulls\": 0.7308834791183472, \"value_count\": 177, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 177.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6286225318908691, \"percentile_inc_nulls\": 0.7279577255249023, \"value_count\": 172, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, 
\"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 516.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.627298891544342, \"percentile_inc_nulls\": 0.7269881963729858, \"value_count\": 171, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 171.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6260062456130981, \"percentile_inc_nulls\": 0.726041316986084, \"value_count\": 167, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 167.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.624721348285675, \"percentile_inc_nulls\": 0.725100040435791, \"value_count\": 166, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 166.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6222134828567505, \"percentile_inc_nulls\": 0.7232630252838135, \"value_count\": 162, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 324.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6209672689437866, \"percentile_inc_nulls\": 0.7223501205444336, \"value_count\": 161, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 161.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6197519898414612, \"percentile_inc_nulls\": 0.7214599251747131, \"value_count\": 157, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 157.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6185522079467773, \"percentile_inc_nulls\": 0.7205810546875, \"value_count\": 155, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, 
\"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 155.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6173679828643799, \"percentile_inc_nulls\": 0.719713568687439, \"value_count\": 153, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 153.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6150148510932922, \"percentile_inc_nulls\": 0.7179898619651794, \"value_count\": 152, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 304.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6138460636138916, \"percentile_inc_nulls\": 0.7171337008476257, \"value_count\": 151, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 151.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6103628873825073, \"percentile_inc_nulls\": 0.7145822048187256, \"value_count\": 150, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 450.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6080716848373413, \"percentile_inc_nulls\": 0.7129038572311401, \"value_count\": 148, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 296.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6069415807723999, \"percentile_inc_nulls\": 0.7120760679244995, \"value_count\": 146, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 146.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6047123670578003, \"percentile_inc_nulls\": 0.7104430198669434, \"value_count\": 144, \"group_name\": \"_city_\", \"total_non_null_rows\": 
129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 288.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6025450229644775, \"percentile_inc_nulls\": 0.7088554501533508, \"value_count\": 140, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 280.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6014846563339233, \"percentile_inc_nulls\": 0.7080786228179932, \"value_count\": 137, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 137.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5993791818618774, \"percentile_inc_nulls\": 0.7065364122390747, \"value_count\": 136, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 272.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5972893238067627, \"percentile_inc_nulls\": 0.7050055265426636, \"value_count\": 135, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 270.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5962598323822021, \"percentile_inc_nulls\": 0.7042514085769653, \"value_count\": 133, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 133.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5942318439483643, \"percentile_inc_nulls\": 0.7027658224105835, \"value_count\": 131, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 262.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5912131071090698, \"percentile_inc_nulls\": 0.7005544900894165, \"value_count\": 130, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 390.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5902145504951477, \"percentile_inc_nulls\": 0.6998230814933777, \"value_count\": 129, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 129.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5892238020896912, \"percentile_inc_nulls\": 0.6990973353385925, \"value_count\": 128, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 128.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5872577428817749, \"percentile_inc_nulls\": 0.6976571083068848, \"value_count\": 127, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 254.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5833565592765808, \"percentile_inc_nulls\": 0.6947994232177734, \"value_count\": 126, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 504.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5823889970779419, \"percentile_inc_nulls\": 0.694090723991394, \"value_count\": 125, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 125.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5804848670959473, \"percentile_inc_nulls\": 0.6926958560943604, \"value_count\": 123, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 246.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.577651858329773, \"percentile_inc_nulls\": 0.6906206607818604, \"value_count\": 122, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 366.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.574842095375061, \"percentile_inc_nulls\": 0.6885623931884766, \"value_count\": 121, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 363.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5739209651947021, \"percentile_inc_nulls\": 0.6878876686096191, \"value_count\": 119, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 119.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5730153322219849, \"percentile_inc_nulls\": 0.687224268913269, \"value_count\": 117, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 117.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5712195634841919, \"percentile_inc_nulls\": 0.685908854007721, \"value_count\": 116, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 232.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5694392919540405, \"percentile_inc_nulls\": 0.6846047639846802, \"value_count\": 115, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 230.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5685569047927856, \"percentile_inc_nulls\": 0.6839583516120911, \"value_count\": 114, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5659328699111938, \"percentile_inc_nulls\": 0.6820362210273743, \"value_count\": 113, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 339.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5641990303993225, \"percentile_inc_nulls\": 0.6807661056518555, \"value_count\": 112, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 224.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5624806880950928, \"percentile_inc_nulls\": 0.6795073747634888, \"value_count\": 111, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 222.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5616291761398315, \"percentile_inc_nulls\": 0.6788836717605591, \"value_count\": 110, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5599417686462402, \"percentile_inc_nulls\": 0.677647590637207, \"value_count\": 109, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 218.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5574339032173157, \"percentile_inc_nulls\": 0.6758105158805847, \"value_count\": 108, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 324.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5541210174560547, \"percentile_inc_nulls\": 0.6733837723731995, \"value_count\": 107, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 428.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5516595840454102, \"percentile_inc_nulls\": 0.6715806722640991, \"value_count\": 106, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 318.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5508468151092529, \"percentile_inc_nulls\": 0.6709853410720825, \"value_count\": 105, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 105.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.547626793384552, \"percentile_inc_nulls\": 0.668626606464386, \"value_count\": 104, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 416.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5452350378036499, \"percentile_inc_nulls\": 0.6668745279312134, \"value_count\": 103, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 309.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5436559915542603, \"percentile_inc_nulls\": 0.6657178401947021, \"value_count\": 102, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 204.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5420923829078674, \"percentile_inc_nulls\": 0.664572536945343, \"value_count\": 101, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 202.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5389962196350098, \"percentile_inc_nulls\": 0.6623045206069946, \"value_count\": 100, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 400.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5351647138595581, \"percentile_inc_nulls\": 0.659497857093811, \"value_count\": 99, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 495.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5328890085220337, \"percentile_inc_nulls\": 0.6578308343887329, \"value_count\": 98, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 294.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5313873887062073, \"percentile_inc_nulls\": 0.6567308902740479, \"value_count\": 97, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 194.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5291581153869629, \"percentile_inc_nulls\": 0.6550979614257812, \"value_count\": 96, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 288.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5284228324890137, \"percentile_inc_nulls\": 0.6545592546463013, \"value_count\": 95, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 95.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5247848033905029, \"percentile_inc_nulls\": 0.6518943309783936, \"value_count\": 94, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 470.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5211855173110962, \"percentile_inc_nulls\": 0.6492577791213989, \"value_count\": 93, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 465.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5204733610153198, \"percentile_inc_nulls\": 0.6487361192703247, \"value_count\": 92, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 92.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5176868438720703, \"percentile_inc_nulls\": 0.6466948986053467, \"value_count\": 90, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 360.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5163090229034424, \"percentile_inc_nulls\": 0.6456856727600098, \"value_count\": 89, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 178.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5129032731056213, \"percentile_inc_nulls\": 0.643190860748291, \"value_count\": 88, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 440.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5095362067222595, \"percentile_inc_nulls\": 0.6407244205474854, \"value_count\": 87, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 435.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5068734884262085, \"percentile_inc_nulls\": 0.6387739181518555, \"value_count\": 86, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 344.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5042417049407959, \"percentile_inc_nulls\": 0.6368460655212402, \"value_count\": 85, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 340.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5035915374755859, \"percentile_inc_nulls\": 0.6363698244094849, \"value_count\": 84, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 84.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5023066401481628, \"percentile_inc_nulls\": 0.6354286074638367, \"value_count\": 83, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 166.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.4984983801841736, \"percentile_inc_nulls\": 0.6326389312744141, \"value_count\": 82, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 492.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.4947364926338196, \"percentile_inc_nulls\": 0.6298832893371582, \"value_count\": 81, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 486.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.49225956201553345, \"percentile_inc_nulls\": 0.6280689239501953, \"value_count\": 80, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 320.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.49042510986328125, \"percentile_inc_nulls\": 0.6267250776290894, \"value_count\": 79, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 237.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.4886138439178467, \"percentile_inc_nulls\": 0.6253982782363892, \"value_count\": 78, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.4880255460739136, \"percentile_inc_nulls\": 0.6249673962593079, \"value_count\": 76, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 76.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.48396188020706177, \"percentile_inc_nulls\": 0.6219906210899353, \"value_count\": 75, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 525.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.48109787702560425, \"percentile_inc_nulls\": 0.6198927164077759, \"value_count\": 74, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 370.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.47940272092819214, \"percentile_inc_nulls\": 0.6186509728431702, \"value_count\": 73, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 219.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.47773081064224243, \"percentile_inc_nulls\": 0.6174262762069702, \"value_count\": 72, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 216.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.47608208656311035, \"percentile_inc_nulls\": 0.6162185668945312, \"value_count\": 71, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 213.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.4722893238067627, \"percentile_inc_nulls\": 0.6134402751922607, \"value_count\": 70, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 490.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.47015297412872314, \"percentile_inc_nulls\": 0.6118752956390381, \"value_count\": 69, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 276.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.46752119064331055, \"percentile_inc_nulls\": 0.6099475026130676, \"value_count\": 68, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 340.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.4659653902053833, \"percentile_inc_nulls\": 0.6088078022003174, \"value_count\": 67, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 201.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.46494364738464355, \"percentile_inc_nulls\": 0.6080594062805176, \"value_count\": 66, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 132.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.46293115615844727, \"percentile_inc_nulls\": 0.6065851449966431, \"value_count\": 65, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 260.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.46194034814834595, \"percentile_inc_nulls\": 0.6058593988418579, \"value_count\": 64, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 128.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.46047741174697876, \"percentile_inc_nulls\": 0.6047877669334412, \"value_count\": 63, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 189.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.4566381573677063, \"percentile_inc_nulls\": 0.6019754409790039, \"value_count\": 62, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 496.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.45286083221435547, \"percentile_inc_nulls\": 0.5992084741592407, \"value_count\": 61, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 488.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.4496098756790161, \"percentile_inc_nulls\": 0.5968270301818848, \"value_count\": 60, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 420.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.4477831721305847, \"percentile_inc_nulls\": 0.5954889059066772, \"value_count\": 59, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 236.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.44374263286590576, \"percentile_inc_nulls\": 0.5925291776657104, \"value_count\": 58, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 522.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.439771831035614, \"percentile_inc_nulls\": 0.5896204710006714, \"value_count\": 57, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 513.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.43760448694229126, \"percentile_inc_nulls\": 0.5880328416824341, \"value_count\": 56, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 280.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.43292152881622314, \"percentile_inc_nulls\": 0.5846024751663208, \"value_count\": 55, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 605.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.4308316111564636, \"percentile_inc_nulls\": 0.5830715894699097, \"value_count\": 54, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 270.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.4263189435005188, \"percentile_inc_nulls\": 0.5797659158706665, \"value_count\": 53, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 583.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.4222939610481262, \"percentile_inc_nulls\": 0.576817512512207, \"value_count\": 52, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 520.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.41716206073760986, \"percentile_inc_nulls\": 0.5730583071708679, \"value_count\": 51, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 663.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.4105827212333679, \"percentile_inc_nulls\": 0.5682387351989746, \"value_count\": 50, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 850.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.4083070158958435, \"percentile_inc_nulls\": 0.566571831703186, \"value_count\": 49, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 294.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.4038485288619995, \"percentile_inc_nulls\": 0.5633058547973633, \"value_count\": 48, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 576.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.40202951431274414, \"percentile_inc_nulls\": 0.5619733929634094, \"value_count\": 47, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 235.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.3913477659225464, \"percentile_inc_nulls\": 0.55414879322052, \"value_count\": 46, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1380.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.3882129192352295, \"percentile_inc_nulls\": 0.5518524050712585, \"value_count\": 45, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 405.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.38174188137054443, \"percentile_inc_nulls\": 0.547112226486206, \"value_count\": 44, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 836.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.3760836720466614, \"percentile_inc_nulls\": 0.5429674386978149, \"value_count\": 43, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 731.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.37250757217407227, \"percentile_inc_nulls\": 0.5403479337692261, \"value_count\": 42, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 462.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.368699312210083, \"percentile_inc_nulls\": 0.5375582575798035, \"value_count\": 41, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 492.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.36374545097351074, \"percentile_inc_nulls\": 0.5339294672012329, \"value_count\": 40, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 640.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.3577079176902771, \"percentile_inc_nulls\": 0.5295068025588989, \"value_count\": 39, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 780.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.35182517766952515, \"percentile_inc_nulls\": 0.5251976251602173, \"value_count\": 38, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 760.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.34180134534835815, \"percentile_inc_nulls\": 0.5178549289703369, \"value_count\": 37, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1295.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.3359495997428894, \"percentile_inc_nulls\": 0.5135684013366699, \"value_count\": 36, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 756.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.33269864320755005, \"percentile_inc_nulls\": 0.511186957359314, \"value_count\": 35, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 420.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.32690876722335815, \"percentile_inc_nulls\": 0.506945788860321, \"value_count\": 34, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 748.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.3223109841346741, \"percentile_inc_nulls\": 0.5035778284072876, \"value_count\": 33, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 594.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.3128986358642578, \"percentile_inc_nulls\": 0.4966830611228943, \"value_count\": 32, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1216.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.30881941318511963, \"percentile_inc_nulls\": 0.49369490146636963, \"value_count\": 31, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 527.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.28861695528030396, \"percentile_inc_nulls\": 0.4788961410522461, \"value_count\": 30, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2610.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.2814338207244873, \"percentile_inc_nulls\": 0.47363436222076416, \"value_count\": 29, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 928.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.27428168058395386, \"percentile_inc_nulls\": 0.46839529275894165, \"value_count\": 28, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 924.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.2682209610939026, \"percentile_inc_nulls\": 0.4639556407928467, \"value_count\": 27, 
\"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 783.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.26479965448379517, \"percentile_inc_nulls\": 0.4614495038986206, \"value_count\": 26, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 442.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.2576397657394409, \"percentile_inc_nulls\": 0.45620471239089966, \"value_count\": 25, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 925.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.24946588277816772, \"percentile_inc_nulls\": 0.45021718740463257, \"value_count\": 24, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1056.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.1960570216178894, \"percentile_inc_nulls\": 0.41109395027160645, \"value_count\": 23, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 6900.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.1882237195968628, \"percentile_inc_nulls\": 0.40535593032836914, \"value_count\": 22, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1012.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.18009626865386963, \"percentile_inc_nulls\": 0.399402379989624, \"value_count\": 21, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1050.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.17328470945358276, \"percentile_inc_nulls\": 0.3944127559661865, 
\"value_count\": 20, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 880.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.16504889726638794, \"percentile_inc_nulls\": 0.38837987184524536, \"value_count\": 19, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1064.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.15641063451766968, \"percentile_inc_nulls\": 0.38205212354660034, \"value_count\": 18, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1116.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.15062075853347778, \"percentile_inc_nulls\": 0.37781089544296265, \"value_count\": 17, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 748.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.14058917760849, \"percentile_inc_nulls\": 0.3704625368118286, \"value_count\": 16, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1296.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.1346677541732788, \"percentile_inc_nulls\": 0.3661249876022339, \"value_count\": 15, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 765.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.11852127313613892, \"percentile_inc_nulls\": 0.35429733991622925, \"value_count\": 14, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2086.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.11147749423980713, \"percentile_inc_nulls\": 
0.3491376042366028, \"value_count\": 13, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 910.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.10451114177703857, \"percentile_inc_nulls\": 0.3440345525741577, \"value_count\": 12, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 900.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.09846585988998413, \"percentile_inc_nulls\": 0.33960628509521484, \"value_count\": 11, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 781.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.0922735333442688, \"percentile_inc_nulls\": 0.33507025241851807, \"value_count\": 10, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 800.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.08335655927658081, \"percentile_inc_nulls\": 0.32853835821151733, \"value_count\": 9, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1152.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.07778346538543701, \"percentile_inc_nulls\": 0.32445597648620605, \"value_count\": 8, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 720.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.015798211097717285, \"percentile_inc_nulls\": 0.27905040979385376, \"value_count\": 7, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 8008.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.011850595474243164, 
\"percentile_inc_nulls\": 0.2761586904525757, \"value_count\": 6, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 510.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.008251309394836426, \"percentile_inc_nulls\": 0.2735220789909363, \"value_count\": 5, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 465.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.0051860809326171875, \"percentile_inc_nulls\": 0.2712767720222473, \"value_count\": 4, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 396.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.002980053424835205, \"percentile_inc_nulls\": 0.2696608304977417, \"value_count\": 3, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 285.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.001215219497680664, \"percentile_inc_nulls\": 0.26836806535720825, \"value_count\": 2, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 228.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.2674778699874878, \"value_count\": 1, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 157.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 4440, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 4440.0, \"distinct_value_count\": 4230}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, 
{\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"city\\\"\", \"subtitle\": \"In this col, 47,174 values (26.7%) are null and there are 4230 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 4440, \"group_name\": \"_city_\", \"value\": \"houston\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 4230}, {\"value_count\": 2837, \"group_name\": \"_city_\", \"value\": \"juno beach\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 4230}, {\"value_count\": 2471, \"group_name\": \"_city_\", \"value\": \"new york\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 4230}, {\"value_count\": 2136, \"group_name\": \"_city_\", \"value\": \"san francisco\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 4230}, {\"value_count\": 1769, \"group_name\": \"_city_\", \"value\": \"boston\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 4230}, {\"value_count\": 1665, \"group_name\": \"_city_\", \"value\": \"charlotte\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 4230}, {\"value_count\": 1150, \"group_name\": \"_city_\", \"value\": \"chicago\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 4230}, {\"value_count\": 1063, 
\"group_name\": \"_city_\", \"value\": \"san diego\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 4230}, {\"value_count\": 993, \"group_name\": \"_city_\", \"value\": \"chapel hill\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 4230}, {\"value_count\": 959, \"group_name\": \"_city_\", \"value\": \"omaha\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 4230}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"canada     l6h 7h7\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 4230}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"granite bay\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 4230}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"black mountain\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 4230}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"schereville\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 4230}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"ontario cn n2z2x6\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 4230}]}, \"encoding\": {\"tooltip\": [{\"field\": 
\"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 4440]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9821445345878601, \"percentile_inc_nulls\": 0.9893006682395935, \"value_count\": 1887, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1887.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9768267273902893, \"percentile_inc_nulls\": 0.9861140847206116, \"value_count\": 562, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 562.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9727673530578613, \"percentile_inc_nulls\": 0.9836816787719727, \"value_count\": 429, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 429.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9687647819519043, \"percentile_inc_nulls\": 0.9812832474708557, \"value_count\": 423, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 423.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9648095369338989, \"percentile_inc_nulls\": 0.9789131879806519, \"value_count\": 418, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, 
\"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 418.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9614787697792053, \"percentile_inc_nulls\": 0.9769173264503479, \"value_count\": 352, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 352.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9581764340400696, \"percentile_inc_nulls\": 0.9749384522438049, \"value_count\": 349, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 349.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9554323554039001, \"percentile_inc_nulls\": 0.973294198513031, \"value_count\": 290, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 290.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.952915370464325, \"percentile_inc_nulls\": 0.9717859625816345, \"value_count\": 266, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 266.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9504267573356628, \"percentile_inc_nulls\": 0.970294713973999, \"value_count\": 263, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 263.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9479759931564331, \"percentile_inc_nulls\": 0.968826174736023, \"value_count\": 259, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 259.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9457144737243652, \"percentile_inc_nulls\": 0.9674710631370544, \"value_count\": 
239, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 239.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9415321350097656, \"percentile_inc_nulls\": 0.9649649262428284, \"value_count\": 221, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 442.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9396585822105408, \"percentile_inc_nulls\": 0.9638422131538391, \"value_count\": 198, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 198.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9378323554992676, \"percentile_inc_nulls\": 0.9627479314804077, \"value_count\": 193, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 193.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9360250234603882, \"percentile_inc_nulls\": 0.9616649746894836, \"value_count\": 191, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 191.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.934425950050354, \"percentile_inc_nulls\": 0.9607067108154297, \"value_count\": 169, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 169.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9328362345695496, \"percentile_inc_nulls\": 0.9597541689872742, \"value_count\": 168, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 168.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 
0.9312560558319092, \"percentile_inc_nulls\": 0.9588072299957275, \"value_count\": 167, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 167.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9297041893005371, \"percentile_inc_nulls\": 0.9578773975372314, \"value_count\": 164, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 164.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9266951680183411, \"percentile_inc_nulls\": 0.9560742974281311, \"value_count\": 159, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 318.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9252285361289978, \"percentile_inc_nulls\": 0.955195426940918, \"value_count\": 155, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 155.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9238280653953552, \"percentile_inc_nulls\": 0.95435631275177, \"value_count\": 148, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 148.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9197971224784851, \"percentile_inc_nulls\": 0.9519408345222473, \"value_count\": 142, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 426.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9184629321098328, \"percentile_inc_nulls\": 0.951141357421875, \"value_count\": 141, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, 
\"sum_tokens_in_value_count_group\": 141.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9171571135520935, \"percentile_inc_nulls\": 0.9503589272499084, \"value_count\": 138, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9132680892944336, \"percentile_inc_nulls\": 0.9480285048484802, \"value_count\": 137, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 411.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9119907021522522, \"percentile_inc_nulls\": 0.9472630620002747, \"value_count\": 135, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 135.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9107416868209839, \"percentile_inc_nulls\": 0.9465146064758301, \"value_count\": 132, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 132.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9095020890235901, \"percentile_inc_nulls\": 0.9457718729972839, \"value_count\": 131, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 131.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9083003997802734, \"percentile_inc_nulls\": 0.9450517892837524, \"value_count\": 127, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 127.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9071081280708313, \"percentile_inc_nulls\": 0.9443373680114746, \"value_count\": 126, \"group_name\": 
\"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 126.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9059253334999084, \"percentile_inc_nulls\": 0.9436286091804504, \"value_count\": 125, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 125.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.904780387878418, \"percentile_inc_nulls\": 0.9429425001144409, \"value_count\": 121, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 121.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9036449193954468, \"percentile_inc_nulls\": 0.9422621130943298, \"value_count\": 120, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9014496207237244, \"percentile_inc_nulls\": 0.9409466981887817, \"value_count\": 116, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 232.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9003614783287048, \"percentile_inc_nulls\": 0.940294623374939, \"value_count\": 115, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 115.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8982418775558472, \"percentile_inc_nulls\": 0.9390245079994202, \"value_count\": 112, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 224.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8972010612487793, 
\"percentile_inc_nulls\": 0.9384008049964905, \"value_count\": 110, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8951382637023926, \"percentile_inc_nulls\": 0.9371647834777832, \"value_count\": 109, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 218.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8941162824630737, \"percentile_inc_nulls\": 0.9365524053573608, \"value_count\": 108, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8931133151054382, \"percentile_inc_nulls\": 0.9359513521194458, \"value_count\": 106, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 106.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8921197652816772, \"percentile_inc_nulls\": 0.9353560209274292, \"value_count\": 105, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 105.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8911356925964355, \"percentile_inc_nulls\": 0.9347663521766663, \"value_count\": 104, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 104.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8901799917221069, \"percentile_inc_nulls\": 0.9341936707496643, \"value_count\": 101, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 101.0, 
\"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8883064389228821, \"percentile_inc_nulls\": 0.9330710172653198, \"value_count\": 99, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 198.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8864896297454834, \"percentile_inc_nulls\": 0.9319823384284973, \"value_count\": 96, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 192.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8855907320976257, \"percentile_inc_nulls\": 0.9314436912536621, \"value_count\": 95, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 95.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8847012519836426, \"percentile_inc_nulls\": 0.9309107065200806, \"value_count\": 94, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 94.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8838212490081787, \"percentile_inc_nulls\": 0.9303833842277527, \"value_count\": 93, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 93.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8820991516113281, \"percentile_inc_nulls\": 0.9293514490127563, \"value_count\": 91, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 182.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8812569975852966, \"percentile_inc_nulls\": 0.9288468360900879, \"value_count\": 89, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, 
\"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 89.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8795915842056274, \"percentile_inc_nulls\": 0.9278489351272583, \"value_count\": 88, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 176.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8787683844566345, \"percentile_inc_nulls\": 0.9273555874824524, \"value_count\": 87, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 87.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8771408796310425, \"percentile_inc_nulls\": 0.9263803958892822, \"value_count\": 86, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 172.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8747279644012451, \"percentile_inc_nulls\": 0.9249345064163208, \"value_count\": 85, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 255.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8739426136016846, \"percentile_inc_nulls\": 0.9244638681411743, \"value_count\": 83, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 83.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8724097013473511, \"percentile_inc_nulls\": 0.9235453605651855, \"value_count\": 81, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 162.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8708956837654114, \"percentile_inc_nulls\": 0.9226381778717041, \"value_count\": 80, 
\"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8701576590538025, \"percentile_inc_nulls\": 0.9221959114074707, \"value_count\": 78, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8694290518760681, \"percentile_inc_nulls\": 0.921759307384491, \"value_count\": 77, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 77.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8672716617584229, \"percentile_inc_nulls\": 0.9204665422439575, \"value_count\": 76, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 228.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8658522367477417, \"percentile_inc_nulls\": 0.9196160435676575, \"value_count\": 75, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 150.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8651520609855652, \"percentile_inc_nulls\": 0.9191964268684387, \"value_count\": 74, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 74.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8630797863006592, \"percentile_inc_nulls\": 0.917954683303833, \"value_count\": 73, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 219.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8617172241210938, 
\"percentile_inc_nulls\": 0.9171382188796997, \"value_count\": 72, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8590677976608276, \"percentile_inc_nulls\": 0.9155505895614624, \"value_count\": 70, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 280.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.858414888381958, \"percentile_inc_nulls\": 0.9151594042778015, \"value_count\": 69, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 69.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8564845323562622, \"percentile_inc_nulls\": 0.9140027165412903, \"value_count\": 68, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 204.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8552166223526001, \"percentile_inc_nulls\": 0.9132429361343384, \"value_count\": 67, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 134.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8545920848846436, \"percentile_inc_nulls\": 0.9128686785697937, \"value_count\": 66, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 66.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8539770245552063, \"percentile_inc_nulls\": 0.9125001430511475, \"value_count\": 65, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 65.0, 
\"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8521602749824524, \"percentile_inc_nulls\": 0.9114115238189697, \"value_count\": 64, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 192.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8515641093254089, \"percentile_inc_nulls\": 0.911054253578186, \"value_count\": 63, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 63.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.849804162979126, \"percentile_inc_nulls\": 0.909999668598175, \"value_count\": 62, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 186.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8492269515991211, \"percentile_inc_nulls\": 0.9096537828445435, \"value_count\": 61, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 61.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8452527523040771, \"percentile_inc_nulls\": 0.9072723984718323, \"value_count\": 60, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 420.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8430196046829224, \"percentile_inc_nulls\": 0.9059342741966248, \"value_count\": 59, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 236.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8419219851493835, \"percentile_inc_nulls\": 0.9052765369415283, \"value_count\": 58, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, 
\"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 116.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8408433198928833, \"percentile_inc_nulls\": 0.9046301245689392, \"value_count\": 57, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8397834897041321, \"percentile_inc_nulls\": 0.9039950966835022, \"value_count\": 56, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8356200456619263, \"percentile_inc_nulls\": 0.9015002846717834, \"value_count\": 55, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 440.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8330652117729187, \"percentile_inc_nulls\": 0.8999693989753723, \"value_count\": 54, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 270.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8320622444152832, \"percentile_inc_nulls\": 0.8993683457374573, \"value_count\": 53, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 106.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8291099667549133, \"percentile_inc_nulls\": 0.8975993394851685, \"value_count\": 52, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 312.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8262144923210144, \"percentile_inc_nulls\": 0.8958643078804016, \"value_count\": 
51, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 306.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8238489031791687, \"percentile_inc_nulls\": 0.8944467902183533, \"value_count\": 50, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 250.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8224579095840454, \"percentile_inc_nulls\": 0.8936132788658142, \"value_count\": 49, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 147.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8188244104385376, \"percentile_inc_nulls\": 0.891435980796814, \"value_count\": 48, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 384.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8161560297012329, \"percentile_inc_nulls\": 0.8898370265960693, \"value_count\": 47, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 282.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8126738667488098, \"percentile_inc_nulls\": 0.887750506401062, \"value_count\": 46, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 368.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8105448484420776, \"percentile_inc_nulls\": 0.8864747285842896, \"value_count\": 45, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 225.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 
0.8076304197311401, \"percentile_inc_nulls\": 0.8847283720970154, \"value_count\": 44, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 308.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8055960536003113, \"percentile_inc_nulls\": 0.8835092782974243, \"value_count\": 43, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 215.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.801224410533905, \"percentile_inc_nulls\": 0.8808897733688354, \"value_count\": 42, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 462.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8004485368728638, \"percentile_inc_nulls\": 0.8804247975349426, \"value_count\": 41, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 82.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.796285092830658, \"percentile_inc_nulls\": 0.8779299855232239, \"value_count\": 40, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 440.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7944399118423462, \"percentile_inc_nulls\": 0.8768243193626404, \"value_count\": 39, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 195.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7912037968635559, \"percentile_inc_nulls\": 0.8748852014541626, \"value_count\": 38, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, 
\"sum_tokens_in_value_count_group\": 342.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7887530326843262, \"percentile_inc_nulls\": 0.8734166622161865, \"value_count\": 37, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 259.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7843246459960938, \"percentile_inc_nulls\": 0.8707630634307861, \"value_count\": 36, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 468.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7810128331184387, \"percentile_inc_nulls\": 0.8687785863876343, \"value_count\": 35, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 350.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7771521806716919, \"percentile_inc_nulls\": 0.8664652109146118, \"value_count\": 34, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 408.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7730928659439087, \"percentile_inc_nulls\": 0.8640327453613281, \"value_count\": 33, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 429.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7676425576210022, \"percentile_inc_nulls\": 0.8607668280601501, \"value_count\": 32, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 576.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7632425427436829, \"percentile_inc_nulls\": 0.8581302165985107, \"value_count\": 31, \"group_name\": 
\"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 465.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7578490376472473, \"percentile_inc_nulls\": 0.8548983335494995, \"value_count\": 30, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 570.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7540073394775391, \"percentile_inc_nulls\": 0.8525962829589844, \"value_count\": 29, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 406.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7471187114715576, \"percentile_inc_nulls\": 0.848468542098999, \"value_count\": 28, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 728.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7412425875663757, \"percentile_inc_nulls\": 0.8449474573135376, \"value_count\": 27, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 621.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7358301281929016, \"percentile_inc_nulls\": 0.8417041897773743, \"value_count\": 26, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 572.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7301527261734009, \"percentile_inc_nulls\": 0.8383021354675293, \"value_count\": 25, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 600.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7231127023696899, 
\"percentile_inc_nulls\": 0.8340836763381958, \"value_count\": 24, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 744.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7181071639060974, \"percentile_inc_nulls\": 0.8310842514038086, \"value_count\": 23, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 529.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.6414999961853027, \"percentile_inc_nulls\": 0.7851796746253967, \"value_count\": 22, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 8096.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.6252058148384094, \"percentile_inc_nulls\": 0.7754158973693848, \"value_count\": 21, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1722.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.6036316156387329, \"percentile_inc_nulls\": 0.7624882459640503, \"value_count\": 20, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2280.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.5906871557235718, \"percentile_inc_nulls\": 0.7547316551208496, \"value_count\": 19, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1368.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.5736548900604248, \"percentile_inc_nulls\": 0.7445255517959595, \"value_count\": 18, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1800.0, 
\"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.5601426959037781, \"percentile_inc_nulls\": 0.7364287972450256, \"value_count\": 17, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1428.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.545457124710083, \"percentile_inc_nulls\": 0.7276289463043213, \"value_count\": 16, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1552.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.5306958556175232, \"percentile_inc_nulls\": 0.7187836766242981, \"value_count\": 15, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1560.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.5112223625183105, \"percentile_inc_nulls\": 0.7071147561073303, \"value_count\": 14, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2058.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.46656954288482666, \"percentile_inc_nulls\": 0.6803579330444336, \"value_count\": 13, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 4719.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.4002573490142822, \"percentile_inc_nulls\": 0.6406223773956299, \"value_count\": 12, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 7008.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.3459245562553406, \"percentile_inc_nulls\": 0.6080650091171265, \"value_count\": 11, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, 
\"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 5742.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.3001267910003662, \"percentile_inc_nulls\": 0.5806220769882202, \"value_count\": 10, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 4840.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.2458791732788086, \"percentile_inc_nulls\": 0.5481158494949341, \"value_count\": 9, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 5733.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.2023523449897766, \"percentile_inc_nulls\": 0.52203369140625, \"value_count\": 8, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 4600.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.13657957315444946, \"percentile_inc_nulls\": 0.48262137174606323, \"value_count\": 7, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 6951.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.10535377264022827, \"percentile_inc_nulls\": 0.4639102816581726, \"value_count\": 6, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 3300.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.07720333337783813, \"percentile_inc_nulls\": 0.44704192876815796, \"value_count\": 5, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2975.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.0521470308303833, \"percentile_inc_nulls\": 0.4320276975631714, 
\"value_count\": 4, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2648.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.03261673450469971, \"percentile_inc_nulls\": 0.42032480239868164, \"value_count\": 3, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2064.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.014959990978240967, \"percentile_inc_nulls\": 0.4097445011138916, \"value_count\": 2, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1866.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.40078020095825195, \"value_count\": 1, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1581.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 1887, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1887.0, \"distinct_value_count\": 10844}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column 
\\\"street_address\\\"\", \"subtitle\": \"In this col, 70,684 values (40.1%) are null and there are 10844 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1887, \"group_name\": \"_street_address_\", \"value\": \"700 universe blvd\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 10844}, {\"value_count\": 562, \"group_name\": \"_street_address_\", \"value\": \"700 universe blvd.\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 10844}, {\"value_count\": 429, \"group_name\": \"_street_address_\", \"value\": \"130 roberts street\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 10844}, {\"value_count\": 423, \"group_name\": \"_street_address_\", \"value\": \"800 taylor st, suite 200\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 10844}, {\"value_count\": 418, \"group_name\": \"_street_address_\", \"value\": \"300 exelon way\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 10844}, {\"value_count\": 352, \"group_name\": \"_street_address_\", \"value\": \"14302 fnb parkway\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 10844}, {\"value_count\": 349, \"group_name\": \"_street_address_\", \"value\": \"1519 king street\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 10844}, {\"value_count\": 290, \"group_name\": \"_street_address_\", \"value\": \"804 carnegie center\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 10844}, {\"value_count\": 266, \"group_name\": \"_street_address_\", \"value\": \"5400 westheimer court\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 10844}, {\"value_count\": 263, \"group_name\": 
\"_street_address_\", \"value\": \"333 washington street\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 10844}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"401 n michigan ave\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 10844}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"c/o soltage llc, 66 york street, 5th floor\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 10844}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"c/o enel north america  inc\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 10844}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"1300 n 17th st.\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 10844}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"12200 ashcake rd\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 10844}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": 
\"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 1887]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9781083464622498, \"percentile_inc_nulls\": 0.984095573425293, \"value_count\": 2805, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2805.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.9638494849205017, \"percentile_inc_nulls\": 0.9737364053726196, \"value_count\": 1827, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1827.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.9563883543014526, \"percentile_inc_nulls\": 0.9683158993721008, \"value_count\": 956, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 956.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.9495516419410706, \"percentile_inc_nulls\": 0.963348925113678, \"value_count\": 876, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 876.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.9427851438522339, \"percentile_inc_nulls\": 0.9584330320358276, \"value_count\": 867, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 867.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.9373453855514526, \"percentile_inc_nulls\": 0.9544810056686401, \"value_count\": 697, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, 
\"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 697.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.9322100281715393, \"percentile_inc_nulls\": 0.9507501721382141, \"value_count\": 658, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 658.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.9273243546485901, \"percentile_inc_nulls\": 0.9472007155418396, \"value_count\": 626, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 626.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.922735333442688, \"percentile_inc_nulls\": 0.9438667297363281, \"value_count\": 588, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 588.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.9182945489883423, \"percentile_inc_nulls\": 0.9406405091285706, \"value_count\": 569, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 569.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.9141347408294678, \"percentile_inc_nulls\": 0.9376183748245239, \"value_count\": 533, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 533.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.9102013111114502, \"percentile_inc_nulls\": 0.9347606897354126, \"value_count\": 504, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 504.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.906283438205719, \"percentile_inc_nulls\": 0.9319143295288086, \"value_count\": 502, \"group_name\": \"_zip_code_\", 
\"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 502.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.9026777148246765, \"percentile_inc_nulls\": 0.929294764995575, \"value_count\": 462, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 462.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.89908766746521, \"percentile_inc_nulls\": 0.9266865253448486, \"value_count\": 460, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 460.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8955131769180298, \"percentile_inc_nulls\": 0.9240896701812744, \"value_count\": 458, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 458.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8920089602470398, \"percentile_inc_nulls\": 0.9215438365936279, \"value_count\": 449, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 449.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8885515928268433, \"percentile_inc_nulls\": 0.9190320372581482, \"value_count\": 443, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 443.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8852112293243408, \"percentile_inc_nulls\": 0.9166052341461182, \"value_count\": 428, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 428.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8819957971572876, \"percentile_inc_nulls\": 0.914269208908081, \"value_count\": 412, 
\"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 412.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8788739442825317, \"percentile_inc_nulls\": 0.9120011925697327, \"value_count\": 400, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 400.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8757911920547485, \"percentile_inc_nulls\": 0.9097615480422974, \"value_count\": 395, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 395.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8730127811431885, \"percentile_inc_nulls\": 0.907742977142334, \"value_count\": 356, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 356.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.870296835899353, \"percentile_inc_nulls\": 0.9057698249816895, \"value_count\": 348, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 348.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8681193590164185, \"percentile_inc_nulls\": 0.9041878581047058, \"value_count\": 279, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 279.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8659496903419495, \"percentile_inc_nulls\": 0.9026116132736206, \"value_count\": 278, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 278.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8638424873352051, \"percentile_inc_nulls\": 
0.9010807275772095, \"value_count\": 270, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 270.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8597372770309448, \"percentile_inc_nulls\": 0.8980982899665833, \"value_count\": 263, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 526.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8577159643173218, \"percentile_inc_nulls\": 0.8966297507286072, \"value_count\": 259, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 259.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8557179570198059, \"percentile_inc_nulls\": 0.8951781988143921, \"value_count\": 256, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 256.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8537512421607971, \"percentile_inc_nulls\": 0.8937493562698364, \"value_count\": 252, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 252.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8517923355102539, \"percentile_inc_nulls\": 0.8923261761665344, \"value_count\": 251, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 251.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8498489856719971, \"percentile_inc_nulls\": 0.8909143209457397, \"value_count\": 249, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 249.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 
0.8479368686676025, \"percentile_inc_nulls\": 0.8895251750946045, \"value_count\": 245, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 245.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8460325598716736, \"percentile_inc_nulls\": 0.8881416916847229, \"value_count\": 244, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 244.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8441438674926758, \"percentile_inc_nulls\": 0.8867695331573486, \"value_count\": 242, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 242.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8422864079475403, \"percentile_inc_nulls\": 0.8854200839996338, \"value_count\": 238, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 238.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8404601812362671, \"percentile_inc_nulls\": 0.8840932846069336, \"value_count\": 234, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8368232250213623, \"percentile_inc_nulls\": 0.8814510703086853, \"value_count\": 233, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 466.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8332019448280334, \"percentile_inc_nulls\": 0.8788201808929443, \"value_count\": 232, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 464.0, \"distinct_value_count\": 
6384}, {\"percentile_ex_nulls\": 0.8314459323883057, \"percentile_inc_nulls\": 0.8775444030761719, \"value_count\": 225, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 225.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8279807567596436, \"percentile_inc_nulls\": 0.8750269412994385, \"value_count\": 222, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 444.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8262715339660645, \"percentile_inc_nulls\": 0.8737851977348328, \"value_count\": 219, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 219.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8246170282363892, \"percentile_inc_nulls\": 0.8725831508636475, \"value_count\": 212, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 212.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8229702711105347, \"percentile_inc_nulls\": 0.8713867664337158, \"value_count\": 211, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 211.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8213468790054321, \"percentile_inc_nulls\": 0.8702074289321899, \"value_count\": 208, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 208.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8198094367980957, \"percentile_inc_nulls\": 0.8690904378890991, \"value_count\": 197, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 
197.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8183343410491943, \"percentile_inc_nulls\": 0.8680187463760376, \"value_count\": 189, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 189.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8168749213218689, \"percentile_inc_nulls\": 0.866958498954773, \"value_count\": 187, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 187.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8154311180114746, \"percentile_inc_nulls\": 0.8659095168113708, \"value_count\": 185, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 185.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8139950633049011, \"percentile_inc_nulls\": 0.8648662567138672, \"value_count\": 184, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 184.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8125668168067932, \"percentile_inc_nulls\": 0.8638286590576172, \"value_count\": 183, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 183.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8097416162490845, \"percentile_inc_nulls\": 0.8617761135101318, \"value_count\": 181, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 362.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8083524107933044, \"percentile_inc_nulls\": 0.8607668280601501, \"value_count\": 178, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, 
\"sum_tokens_in_value_count_group\": 178.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8069709539413452, \"percentile_inc_nulls\": 0.8597632050514221, \"value_count\": 177, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 177.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8056130409240723, \"percentile_inc_nulls\": 0.8587766289710999, \"value_count\": 174, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 174.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8042628169059753, \"percentile_inc_nulls\": 0.8577957153320312, \"value_count\": 173, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 173.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8029204607009888, \"percentile_inc_nulls\": 0.8568204641342163, \"value_count\": 172, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 172.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8002669215202332, \"percentile_inc_nulls\": 0.8548926711082458, \"value_count\": 170, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 340.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7976446151733398, \"percentile_inc_nulls\": 0.85298752784729, \"value_count\": 168, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 336.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7950378656387329, \"percentile_inc_nulls\": 0.8510937690734863, \"value_count\": 167, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, 
\"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 334.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7937423586845398, \"percentile_inc_nulls\": 0.8501524925231934, \"value_count\": 166, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 166.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7924546003341675, \"percentile_inc_nulls\": 0.8492169380187988, \"value_count\": 165, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 165.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7911824584007263, \"percentile_inc_nulls\": 0.8482927680015564, \"value_count\": 163, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 163.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7886537909507751, \"percentile_inc_nulls\": 0.8464556932449341, \"value_count\": 162, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 324.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7861407399177551, \"percentile_inc_nulls\": 0.8446298837661743, \"value_count\": 161, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 322.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7848920226097107, \"percentile_inc_nulls\": 0.8437227010726929, \"value_count\": 160, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7836511135101318, \"percentile_inc_nulls\": 0.8428211808204651, \"value_count\": 159, \"group_name\": \"_zip_code_\", 
\"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 159.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7824180126190186, \"percentile_inc_nulls\": 0.841925323009491, \"value_count\": 158, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 158.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7812082767486572, \"percentile_inc_nulls\": 0.8410464525222778, \"value_count\": 155, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 155.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7800064086914062, \"percentile_inc_nulls\": 0.8401732444763184, \"value_count\": 154, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 154.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7788200974464417, \"percentile_inc_nulls\": 0.839311420917511, \"value_count\": 152, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 152.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7776416540145874, \"percentile_inc_nulls\": 0.8384552597999573, \"value_count\": 151, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 151.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.776470959186554, \"percentile_inc_nulls\": 0.8376047611236572, \"value_count\": 150, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 150.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7753158807754517, \"percentile_inc_nulls\": 0.8367655873298645, \"value_count\": 148, 
\"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 148.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7741842269897461, \"percentile_inc_nulls\": 0.8359434604644775, \"value_count\": 145, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 145.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7730681896209717, \"percentile_inc_nulls\": 0.8351325988769531, \"value_count\": 143, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 143.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7719599604606628, \"percentile_inc_nulls\": 0.8343274593353271, \"value_count\": 142, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 142.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7708594799041748, \"percentile_inc_nulls\": 0.8335279822349548, \"value_count\": 141, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 141.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7687054872512817, \"percentile_inc_nulls\": 0.831963062286377, \"value_count\": 138, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 276.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.767636239528656, \"percentile_inc_nulls\": 0.8311862945556641, \"value_count\": 137, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 137.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7655134201049805, \"percentile_inc_nulls\": 
0.8296440243721008, \"value_count\": 136, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 272.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7644597887992859, \"percentile_inc_nulls\": 0.8288785815238953, \"value_count\": 135, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 135.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7634140253067017, \"percentile_inc_nulls\": 0.8281188011169434, \"value_count\": 134, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 134.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7623916268348694, \"percentile_inc_nulls\": 0.8273760080337524, \"value_count\": 131, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 131.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7603780627250671, \"percentile_inc_nulls\": 0.8259131908416748, \"value_count\": 129, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 258.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7593790888786316, \"percentile_inc_nulls\": 0.8251873850822449, \"value_count\": 128, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 128.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7573966979980469, \"percentile_inc_nulls\": 0.8237472176551819, \"value_count\": 127, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 254.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 
0.7564133405685425, \"percentile_inc_nulls\": 0.823032796382904, \"value_count\": 126, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 126.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7554377913475037, \"percentile_inc_nulls\": 0.8223240375518799, \"value_count\": 125, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 125.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7515667676925659, \"percentile_inc_nulls\": 0.8195117115974426, \"value_count\": 124, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 496.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.750614583492279, \"percentile_inc_nulls\": 0.8188199400901794, \"value_count\": 122, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 122.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7487415075302124, \"percentile_inc_nulls\": 0.8174591660499573, \"value_count\": 120, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 240.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7478127479553223, \"percentile_inc_nulls\": 0.8167843818664551, \"value_count\": 119, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 119.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7450499534606934, \"percentile_inc_nulls\": 0.8147772550582886, \"value_count\": 118, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 354.0, \"distinct_value_count\": 
6384}, {\"percentile_ex_nulls\": 0.7441368699073792, \"percentile_inc_nulls\": 0.8141138553619385, \"value_count\": 117, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 117.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7432393431663513, \"percentile_inc_nulls\": 0.8134617805480957, \"value_count\": 115, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 115.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7423496246337891, \"percentile_inc_nulls\": 0.8128154277801514, \"value_count\": 114, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7414677143096924, \"percentile_inc_nulls\": 0.8121746778488159, \"value_count\": 113, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 113.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7397195100784302, \"percentile_inc_nulls\": 0.8109046220779419, \"value_count\": 112, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 224.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7388532161712646, \"percentile_inc_nulls\": 0.8102751970291138, \"value_count\": 111, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 111.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7379946708679199, \"percentile_inc_nulls\": 0.8096514940261841, \"value_count\": 110, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 
110.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7362933158874512, \"percentile_inc_nulls\": 0.8084154725074768, \"value_count\": 109, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 218.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7354504466056824, \"percentile_inc_nulls\": 0.8078030943870544, \"value_count\": 108, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7346231937408447, \"percentile_inc_nulls\": 0.8072021007537842, \"value_count\": 106, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 106.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.732984185218811, \"percentile_inc_nulls\": 0.8060113787651062, \"value_count\": 105, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 210.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.731360912322998, \"percentile_inc_nulls\": 0.8048319816589355, \"value_count\": 104, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 208.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7281454205513, \"percentile_inc_nulls\": 0.8024959564208984, \"value_count\": 103, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 412.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7273493409156799, \"percentile_inc_nulls\": 0.8019176125526428, \"value_count\": 102, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, 
\"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7257884740829468, \"percentile_inc_nulls\": 0.800783634185791, \"value_count\": 100, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.725023627281189, \"percentile_inc_nulls\": 0.80022794008255, \"value_count\": 98, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7235095500946045, \"percentile_inc_nulls\": 0.7991279363632202, \"value_count\": 97, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 194.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7227603197097778, \"percentile_inc_nulls\": 0.7985836267471313, \"value_count\": 96, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 96.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7212774753570557, \"percentile_inc_nulls\": 0.7975063323974609, \"value_count\": 95, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 190.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7190765738487244, \"percentile_inc_nulls\": 0.7959073781967163, \"value_count\": 94, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 282.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7168990969657898, \"percentile_inc_nulls\": 0.7943254709243774, \"value_count\": 93, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, 
\"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 279.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7140582799911499, \"percentile_inc_nulls\": 0.79226154088974, \"value_count\": 91, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 364.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7133558988571167, \"percentile_inc_nulls\": 0.7917512655258179, \"value_count\": 90, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7112720608711243, \"percentile_inc_nulls\": 0.7902373671531677, \"value_count\": 89, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 267.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7085248827934265, \"percentile_inc_nulls\": 0.7882415056228638, \"value_count\": 88, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 352.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7064878940582275, \"percentile_inc_nulls\": 0.7867616415023804, \"value_count\": 87, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 261.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7038031816482544, \"percentile_inc_nulls\": 0.7848111391067505, \"value_count\": 86, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 344.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7018129825592041, \"percentile_inc_nulls\": 0.7833652496337891, \"value_count\": 85, \"group_name\": \"_zip_code_\", 
\"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 255.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7005017995834351, \"percentile_inc_nulls\": 0.7824127078056335, \"value_count\": 84, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 168.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6998540163040161, \"percentile_inc_nulls\": 0.7819421291351318, \"value_count\": 83, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 83.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6979341506958008, \"percentile_inc_nulls\": 0.7805472612380981, \"value_count\": 82, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 246.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6960376501083374, \"percentile_inc_nulls\": 0.7791694402694702, \"value_count\": 81, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 243.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6954132914543152, \"percentile_inc_nulls\": 0.7787158489227295, \"value_count\": 80, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 80.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6947967410087585, \"percentile_inc_nulls\": 0.7782679200172424, \"value_count\": 79, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 79.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6917529702186584, \"percentile_inc_nulls\": 0.7760565876960754, \"value_count\": 78, 
\"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 390.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6899501085281372, \"percentile_inc_nulls\": 0.774746835231781, \"value_count\": 77, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 231.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6875775456428528, \"percentile_inc_nulls\": 0.7730231285095215, \"value_count\": 76, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 304.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6869922280311584, \"percentile_inc_nulls\": 0.7725979089736938, \"value_count\": 75, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 75.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6841045618057251, \"percentile_inc_nulls\": 0.7705000042915344, \"value_count\": 74, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 370.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6812559366226196, \"percentile_inc_nulls\": 0.7684304118156433, \"value_count\": 73, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 365.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6806939840316772, \"percentile_inc_nulls\": 0.7680221796035767, \"value_count\": 72, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 72.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6779233813285828, \"percentile_inc_nulls\": 0.7660093307495117, 
\"value_count\": 71, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 355.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6762844324111938, \"percentile_inc_nulls\": 0.7648186087608337, \"value_count\": 70, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 210.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6719763278961182, \"percentile_inc_nulls\": 0.761688768863678, \"value_count\": 69, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 552.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6687921285629272, \"percentile_inc_nulls\": 0.7593753933906555, \"value_count\": 68, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 408.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6656546592712402, \"percentile_inc_nulls\": 0.7570960521697998, \"value_count\": 67, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 402.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6635942459106445, \"percentile_inc_nulls\": 0.7555991411209106, \"value_count\": 66, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 264.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6585213541984558, \"percentile_inc_nulls\": 0.7519136667251587, \"value_count\": 65, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 650.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6555244326591492, 
\"percentile_inc_nulls\": 0.7497363090515137, \"value_count\": 64, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 384.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6525743007659912, \"percentile_inc_nulls\": 0.7475930452346802, \"value_count\": 63, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 378.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6506388187408447, \"percentile_inc_nulls\": 0.7461869120597839, \"value_count\": 62, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 248.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6477823257446289, \"percentile_inc_nulls\": 0.7441116571426392, \"value_count\": 61, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 366.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6449726819992065, \"percentile_inc_nulls\": 0.7420704364776611, \"value_count\": 60, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 360.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6408284902572632, \"percentile_inc_nulls\": 0.7390596866607666, \"value_count\": 59, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 531.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6376599073410034, \"percentile_inc_nulls\": 0.7367576360702515, \"value_count\": 58, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 406.0, \"distinct_value_count\": 6384}, 
{\"percentile_ex_nulls\": 0.6345459222793579, \"percentile_inc_nulls\": 0.7344952821731567, \"value_count\": 57, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 399.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6319235563278198, \"percentile_inc_nulls\": 0.7325901985168457, \"value_count\": 56, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 336.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.62848961353302, \"percentile_inc_nulls\": 0.730095386505127, \"value_count\": 55, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 440.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6268038153648376, \"percentile_inc_nulls\": 0.7288706302642822, \"value_count\": 54, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 216.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6230810880661011, \"percentile_inc_nulls\": 0.7261660099029541, \"value_count\": 53, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 477.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6214576959609985, \"percentile_inc_nulls\": 0.7249866724014282, \"value_count\": 52, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 208.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6182734966278076, \"percentile_inc_nulls\": 0.7226732969284058, \"value_count\": 51, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 408.0, 
\"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.614761471748352, \"percentile_inc_nulls\": 0.7201218008995056, \"value_count\": 50, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 450.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6124669313430786, \"percentile_inc_nulls\": 0.7184548377990723, \"value_count\": 49, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 294.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6079715490341187, \"percentile_inc_nulls\": 0.7151888608932495, \"value_count\": 48, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 576.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6028361320495605, \"percentile_inc_nulls\": 0.7114579677581787, \"value_count\": 47, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 658.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.5931429862976074, \"percentile_inc_nulls\": 0.7044157981872559, \"value_count\": 46, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1242.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.5896309614181519, \"percentile_inc_nulls\": 0.7018643021583557, \"value_count\": 45, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 450.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.5868837237358093, \"percentile_inc_nulls\": 0.6998684406280518, \"value_count\": 44, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, 
\"sum_tokens_in_value_count_group\": 352.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.5818498134613037, \"percentile_inc_nulls\": 0.6962112784385681, \"value_count\": 43, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 645.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.578244149684906, \"percentile_inc_nulls\": 0.6935917139053345, \"value_count\": 42, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 462.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.5718444585800171, \"percentile_inc_nulls\": 0.6889423131942749, \"value_count\": 41, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 820.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.5674738883972168, \"percentile_inc_nulls\": 0.6857670545578003, \"value_count\": 40, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 560.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.5622994899749756, \"percentile_inc_nulls\": 0.682007908821106, \"value_count\": 39, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 663.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.5551818013191223, \"percentile_inc_nulls\": 0.6768368482589722, \"value_count\": 38, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 912.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.5485401749610901, \"percentile_inc_nulls\": 0.6720116138458252, \"value_count\": 37, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, 
\"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 851.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.5417970418930054, \"percentile_inc_nulls\": 0.6671127080917358, \"value_count\": 36, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 864.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.5363339185714722, \"percentile_inc_nulls\": 0.6631436944007874, \"value_count\": 35, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 700.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.52757728099823, \"percentile_inc_nulls\": 0.6567819118499756, \"value_count\": 34, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1122.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.5216536521911621, \"percentile_inc_nulls\": 0.6524783372879028, \"value_count\": 33, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 759.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.5106648802757263, \"percentile_inc_nulls\": 0.6444950103759766, \"value_count\": 32, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1408.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.5041325092315674, \"percentile_inc_nulls\": 0.6397491693496704, \"value_count\": 31, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 837.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.4760362505912781, \"percentile_inc_nulls\": 0.6193370819091797, \"value_count\": 30, \"group_name\": \"_zip_code_\", 
\"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 3600.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.4685673117637634, \"percentile_inc_nulls\": 0.6139108538627625, \"value_count\": 29, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 957.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.4598262906074524, \"percentile_inc_nulls\": 0.607560396194458, \"value_count\": 28, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1120.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.4490794539451599, \"percentile_inc_nulls\": 0.5997527837753296, \"value_count\": 27, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1377.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.44238317012786865, \"percentile_inc_nulls\": 0.594887912273407, \"value_count\": 26, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 858.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.43360310792922974, \"percentile_inc_nulls\": 0.5885090827941895, \"value_count\": 25, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1125.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.4227392077445984, \"percentile_inc_nulls\": 0.5806164741516113, \"value_count\": 24, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1392.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.32885873317718506, \"percentile_inc_nulls\": 0.5124117136001587, \"value_count\": 23, 
\"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 12029.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.31237560510635376, \"percentile_inc_nulls\": 0.5004366040229797, \"value_count\": 22, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2112.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.29729729890823364, \"percentile_inc_nulls\": 0.4894821047782898, \"value_count\": 21, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1932.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.28777581453323364, \"percentile_inc_nulls\": 0.48256468772888184, \"value_count\": 20, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1220.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.2762095332145691, \"percentile_inc_nulls\": 0.47416168451309204, \"value_count\": 19, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1482.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.26216137409210205, \"percentile_inc_nulls\": 0.4639556407928467, \"value_count\": 18, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1800.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.2504858374595642, \"percentile_inc_nulls\": 0.45547330379486084, \"value_count\": 17, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1496.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.23737424612045288, \"percentile_inc_nulls\": 
0.44594764709472656, \"value_count\": 16, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1680.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.22718936204910278, \"percentile_inc_nulls\": 0.4385482668876648, \"value_count\": 15, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1305.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.20041990280151367, \"percentile_inc_nulls\": 0.4191000461578369, \"value_count\": 14, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 3430.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.18875211477279663, \"percentile_inc_nulls\": 0.41062337160110474, \"value_count\": 13, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1495.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.1753596067428589, \"percentile_inc_nulls\": 0.40089356899261475, \"value_count\": 12, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1716.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.16196703910827637, \"percentile_inc_nulls\": 0.39116382598876953, \"value_count\": 11, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1716.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.15135294198989868, \"percentile_inc_nulls\": 0.3834525942802429, \"value_count\": 10, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1360.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 
0.13386297225952148, \"percentile_inc_nulls\": 0.37074607610702515, \"value_count\": 9, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2241.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.12281179428100586, \"percentile_inc_nulls\": 0.36271733045578003, \"value_count\": 8, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1416.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.02824455499649048, \"percentile_inc_nulls\": 0.2940135598182678, \"value_count\": 7, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 12117.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.021314144134521484, \"percentile_inc_nulls\": 0.28897857666015625, \"value_count\": 6, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 888.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.014524221420288086, \"percentile_inc_nulls\": 0.2840456962585449, \"value_count\": 5, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 870.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.008499085903167725, \"percentile_inc_nulls\": 0.279668390750885, \"value_count\": 4, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 772.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.004823207855224609, \"percentile_inc_nulls\": 0.27699780464172363, \"value_count\": 3, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 471.0, 
\"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.0018886923789978027, \"percentile_inc_nulls\": 0.27486592531204224, \"value_count\": 2, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 376.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.27349376678466797, \"value_count\": 1, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 242.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 2805, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2805.0, \"distinct_value_count\": 6384}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"zip_code\\\"\", \"subtitle\": \"In this col, 48,235 values (27.3%) are null and there are 6384 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 2805, \"group_name\": \"_zip_code_\", \"value\": \"33408\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 6384}, {\"value_count\": 1827, \"group_name\": \"_zip_code_\", \"value\": \"77002\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, 
\"distinct_value_count\": 6384}, {\"value_count\": 956, \"group_name\": \"_zip_code_\", \"value\": \"27517\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 6384}, {\"value_count\": 876, \"group_name\": \"_zip_code_\", \"value\": \"01810\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 6384}, {\"value_count\": 867, \"group_name\": \"_zip_code_\", \"value\": \"77056\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 6384}, {\"value_count\": 697, \"group_name\": \"_zip_code_\", \"value\": \"68154\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 6384}, {\"value_count\": 658, \"group_name\": \"_zip_code_\", \"value\": \"28801\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 6384}, {\"value_count\": 626, \"group_name\": \"_zip_code_\", \"value\": \"94104\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 6384}, {\"value_count\": 588, \"group_name\": \"_zip_code_\", \"value\": \"28202\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 6384}, {\"value_count\": 569, \"group_name\": \"_zip_code_\", \"value\": \"10017\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 6384}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": 
[{\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"00222\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 6384}, {\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"91377\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 6384}, {\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"44130\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 6384}, {\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"02321\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 6384}, {\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"56142\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 6384}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 2805]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}], \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\"}, {\"mode\": \"vega-lite\"});\n",
+       "</script>"
+      ],
+      "text/plain": [
+       "alt.VConcatChart(...)"
+      ]
+     },
+     "execution_count": 211,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "profile_columns(eia_match_df[match_cols], db_api=DuckDBAPI(), top_n=10, bottom_n=5)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "69f5fc54-f479-495c-86fc-48accda883d0",
+   "metadata": {},
+   "source": [
+    "## Blocking"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 300,
+   "id": "6402e556-b87c-47ca-bc30-ced2b42e6626",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "br0 = \"l.report_year = r.report_year and substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3)\"\n",
+    "br1 = \"l.report_year = r.report_year and l.street_address = r.street_address\"\n",
+    "# br2 = \"l.report_year = r.report_year and substr(l.company_name_mphone,1,2) = substr(r.company_name_mphone,1,2) and l.city = r.city\"\n",
+    "br4 = \"l.report_year = r.report_year and l.phone_number = r.phone_number\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 257,
+   "id": "1e6d3d24-f8ce-4118-9d31-d4f237d28237",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'number_of_comparisons_generated_pre_filter_conditions': 618634,\n",
+       " 'number_of_comparisons_to_be_scored_post_filter_conditions': 618634,\n",
+       " 'filter_conditions_identified': '',\n",
+       " 'equi_join_conditions_identified': 'l.report_year = r.report_year AND SUBSTRING(l.company_name_mphone, 1, 4) = SUBSTRING(r.company_name_mphone, 1, 4)',\n",
+       " 'link_type_join_condition': 'where l.\"source_dataset\" || \\'-__-\\' || l.\"record_id\" < r.\"source_dataset\" || \\'-__-\\' || r.\"record_id\" and l.\"source_dataset\" != r.\"source_dataset\"'}"
+      ]
+     },
+     "execution_count": 257,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "counts = count_comparisons_from_blocking_rule(\n",
+    "    table_or_tables=[sec_match_df, eia_match_df],\n",
+    "    blocking_rule=br0,\n",
+    "    link_type=\"link_only\",\n",
+    "    unique_id_column_name='record_id',\n",
+    "    db_api=db_api,\n",
+    ")\n",
+    "\n",
+    "counts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 259,
+   "id": "c37c117a-0fa2-4b8a-9fae-da3569895ca3",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>key_0</th>\n",
+       "      <th>key_1</th>\n",
+       "      <th>key_2</th>\n",
+       "      <th>count_l</th>\n",
+       "      <th>count_r</th>\n",
+       "      <th>block_count</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>2023</td>\n",
+       "      <td>boston</td>\n",
+       "      <td>02110</td>\n",
+       "      <td>113</td>\n",
+       "      <td>134</td>\n",
+       "      <td>15142</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2022</td>\n",
+       "      <td>boston</td>\n",
+       "      <td>02110</td>\n",
+       "      <td>116</td>\n",
+       "      <td>110</td>\n",
+       "      <td>12760</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2021</td>\n",
+       "      <td>boston</td>\n",
+       "      <td>02110</td>\n",
+       "      <td>113</td>\n",
+       "      <td>88</td>\n",
+       "      <td>9944</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   key_0   key_1  key_2  count_l  count_r  block_count\n",
+       "0   2023  boston  02110      113      134        15142\n",
+       "1   2022  boston  02110      116      110        12760\n",
+       "2   2021  boston  02110      113       88         9944"
+      ]
+     },
+     "execution_count": 259,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "result = n_largest_blocks(\n",
+    "    table_or_tables=[sec_match_df, eia_match_df],\n",
+    "    blocking_rule=br3,\n",
+    "    link_type=\"link_only\",\n",
+    "    db_api=db_api,\n",
+    "    n_largest=3\n",
+    ")\n",
+    "\n",
+    "result.as_pandas_dataframe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 302,
+   "id": "4e1a9844-5d98-4cac-a083-eef134f083ce",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "<style>\n",
+       "  #altair-viz-4929eafdbb4f44fb9220f865b54fe3cc.vega-embed {\n",
+       "    width: 100%;\n",
+       "    display: flex;\n",
+       "  }\n",
+       "\n",
+       "  #altair-viz-4929eafdbb4f44fb9220f865b54fe3cc.vega-embed details,\n",
+       "  #altair-viz-4929eafdbb4f44fb9220f865b54fe3cc.vega-embed details summary {\n",
+       "    position: relative;\n",
+       "  }\n",
+       "</style>\n",
+       "<div id=\"altair-viz-4929eafdbb4f44fb9220f865b54fe3cc\"></div>\n",
+       "<script type=\"text/javascript\">\n",
+       "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
+       "  (function(spec, embedOpt){\n",
+       "    let outputDiv = document.currentScript.previousElementSibling;\n",
+       "    if (outputDiv.id !== \"altair-viz-4929eafdbb4f44fb9220f865b54fe3cc\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-4929eafdbb4f44fb9220f865b54fe3cc\");\n",
+       "    }\n",
+       "    const paths = {\n",
+       "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
+       "      \"vega-lib\": \"https://cdn.jsdelivr.net/npm/vega-lib?noext\",\n",
+       "      \"vega-lite\": \"https://cdn.jsdelivr.net/npm/vega-lite@5.20.1?noext\",\n",
+       "      \"vega-embed\": \"https://cdn.jsdelivr.net/npm/vega-embed@6?noext\",\n",
+       "    };\n",
+       "\n",
+       "    function maybeLoadScript(lib, version) {\n",
+       "      var key = `${lib.replace(\"-\", \"\")}_version`;\n",
+       "      return (VEGA_DEBUG[key] == version) ?\n",
+       "        Promise.resolve(paths[lib]) :\n",
+       "        new Promise(function(resolve, reject) {\n",
+       "          var s = document.createElement('script');\n",
+       "          document.getElementsByTagName(\"head\")[0].appendChild(s);\n",
+       "          s.async = true;\n",
+       "          s.onload = () => {\n",
+       "            VEGA_DEBUG[key] = version;\n",
+       "            return resolve(paths[lib]);\n",
+       "          };\n",
+       "          s.onerror = () => reject(`Error loading script: ${paths[lib]}`);\n",
+       "          s.src = paths[lib];\n",
+       "        });\n",
+       "    }\n",
+       "\n",
+       "    function showError(err) {\n",
+       "      outputDiv.innerHTML = `<div class=\"error\" style=\"color:red;\">${err}</div>`;\n",
+       "      throw err;\n",
+       "    }\n",
+       "\n",
+       "    function displayChart(vegaEmbed) {\n",
+       "      vegaEmbed(outputDiv, spec, embedOpt)\n",
+       "        .catch(err => showError(`Javascript Error: ${err.message}<br>This usually means there's a typo in your chart specification. See the javascript console for the full traceback.`));\n",
+       "    }\n",
+       "\n",
+       "    if(typeof define === \"function\" && define.amd) {\n",
+       "      requirejs.config({paths});\n",
+       "      require([\"vega-embed\"], displayChart, err => showError(`Error loading script: ${err.message}`));\n",
+       "    } else {\n",
+       "      maybeLoadScript(\"vega\", \"5\")\n",
+       "        .then(() => maybeLoadScript(\"vega-lite\", \"5.20.1\"))\n",
+       "        .then(() => maybeLoadScript(\"vega-embed\", \"6\"))\n",
+       "        .catch(showError)\n",
+       "        .then(() => displayChart(vegaEmbed));\n",
+       "    }\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 300, \"continuousHeight\": 300}}, \"data\": {\"name\": \"data-bc6bb82997e900308036d5ce309e7401\"}, \"mark\": \"bar\", \"encoding\": {\"order\": {\"field\": \"cumulative_rows\"}, \"tooltip\": [{\"field\": \"blocking_rule\", \"title\": \"SQL Condition\", \"type\": \"nominal\"}, {\"field\": \"row_count\", \"format\": \",\", \"title\": \"Comparisons Generated\", \"type\": \"quantitative\"}, {\"field\": \"cumulative_rows\", \"format\": \",\", \"title\": \"Cumulative Comparisons\", \"type\": \"quantitative\"}, {\"field\": \"cartesian\", \"format\": \",\", \"title\": \"Total comparisons in Cartesian product\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"start\", \"title\": \"Comparisons Generated by Rule(s)\", \"type\": \"quantitative\"}, \"x2\": {\"field\": \"cumulative_rows\"}, \"y\": {\"field\": \"blocking_rule\", \"sort\": [\"-x2\"], \"title\": \"SQL Blocking Rule\"}}, \"height\": {\"step\": 20}, \"title\": {\"text\": \"Count of Additional Comparisons Generated by Each Blocking Rule\", \"subtitle\": \"(Counts exclude comparisons already generated by previous rules)\"}, \"width\": 450, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-bc6bb82997e900308036d5ce309e7401\": [{\"blocking_rule\": \"l.report_year = r.report_year and substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3)\", \"row_count\": 2371618, \"cumulative_rows\": 2371618, \"cartesian\": 40620617120, \"match_key\": \"0\", \"start\": 0}, {\"blocking_rule\": \"l.report_year = r.report_year and l.street_address = r.street_address\", \"row_count\": 7101, \"cumulative_rows\": 2378719, \"cartesian\": 40620617120, \"match_key\": \"1\", \"start\": 2371618}]}}, {\"mode\": \"vega-lite\"});\n",
+       "</script>"
+      ],
+      "text/plain": [
+       "alt.Chart(...)"
+      ]
+     },
+     "execution_count": 302,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "blocking_rules_for_analysis = [\n",
+    "    br0, br1\n",
+    "]\n",
+    "\n",
+    "\n",
+    "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n",
+    "    table_or_tables=[sec_match_df, eia_match_df],\n",
+    "    blocking_rules=blocking_rules_for_analysis,\n",
+    "    db_api=db_api,\n",
+    "    unique_id_column_name='record_id',\n",
+    "    link_type=\"link_only\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "377b0017-e46f-4d06-8cb5-af2b7725fc0e",
+   "metadata": {},
+   "source": [
+    "## Create Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 382,
+   "id": "f72f546c-c338-4c9f-aab1-ba03c3169e18",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Comparison 'JaccardAtThresholds' of \"company_name\".\n",
+      "Similarity is assessed using the following ComparisonLevels:\n",
+      "    - 'company_name is NULL' with SQL rule: \"company_name_l\" IS NULL OR \"company_name_r\" IS NULL\n",
+      "    - 'Exact match on company_name' with SQL rule: \"company_name_l\" = \"company_name_r\"\n",
+      "    - 'Jaccard distance of 'company_name >= 0.9'' with SQL rule: jaccard(\"company_name_l\", \"company_name_r\") >= 0.9\n",
+      "    - 'Jaccard distance of 'company_name >= 0.7'' with SQL rule: jaccard(\"company_name_l\", \"company_name_r\") >= 0.7\n",
+      "    - 'All other comparisons' with SQL rule: ELSE\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# company_name_comparison = cl.NameComparison(\n",
+    "#     \"company_name\",\n",
+    "    # dmeta_col_name=\"company_name_mphone_list\" # this was breaking it for some reason\n",
+    "# )\n",
+    "company_name_comparison = cl.JaccardAtThresholds(\n",
+    "     \"company_name\",\n",
+    "    # dmeta_col_name=\"company_name_mphone_list\" # this was breaking it for some reason\n",
+    ")\n",
+    "print(company_name_comparison.get_comparison(\"duckdb\").human_readable_description)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 373,
+   "id": "4298a288-c306-4d75-9d72-e5b8f87774ce",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Comparison 'LevenshteinAtThresholds' of \"street_address\".\n",
+      "Similarity is assessed using the following ComparisonLevels:\n",
+      "    - 'street_address is NULL' with SQL rule: \"street_address_l\" IS NULL OR \"street_address_r\" IS NULL\n",
+      "    - 'Exact match on street_address' with SQL rule: \"street_address_l\" = \"street_address_r\"\n",
+      "    - 'Levenshtein distance of street_address <= 1' with SQL rule: levenshtein(\"street_address_l\", \"street_address_r\") <= 1\n",
+      "    - 'Levenshtein distance of street_address <= 2' with SQL rule: levenshtein(\"street_address_l\", \"street_address_r\") <= 2\n",
+      "    - 'All other comparisons' with SQL rule: ELSE\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "address_comparison = cl.LevenshteinAtThresholds(\n",
+    "    \"street_address\",\n",
+    "    # size_threshold_or_thresholds=[1,2,3]\n",
+    ")\n",
+    "print(address_comparison.get_comparison(\"duckdb\").human_readable_description)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 267,
+   "id": "63ed7cd2-d803-4d17-b730-c9fc17df0607",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "zip_code_comparison = cl.ExactMatch(\"zip_code\").configure(term_frequency_adjustments=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 268,
+   "id": "974a3982-38a1-45cb-9875-b8d4584c808d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "state_comparison = cl.ExactMatch(\"state\").configure(term_frequency_adjustments=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 269,
+   "id": "7592619b-340a-4496-8195-9ce932cae699",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Comparison 'NameComparison' of \"city\".\n",
+      "Similarity is assessed using the following ComparisonLevels:\n",
+      "    - 'city is NULL' with SQL rule: \"city_l\" IS NULL OR \"city_r\" IS NULL\n",
+      "    - 'Exact match on city' with SQL rule: \"city_l\" = \"city_r\"\n",
+      "    - 'Jaro-Winkler distance of city >= 0.9' with SQL rule: jaro_winkler_similarity(\"city_l\", \"city_r\") >= 0.9\n",
+      "    - 'All other comparisons' with SQL rule: ELSE\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "city_comparison = cl.NameComparison(\n",
+    "    \"city\",\n",
+    "    jaro_winkler_thresholds=[0.9]\n",
+    "    # dmeta_col_name=\"company_name_mphone\" # this was breaking it for some reason\n",
+    ")\n",
+    "print(city_comparison.get_comparison(\"duckdb\").human_readable_description)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 383,
+   "id": "a946c820-9b93-41f1-9fc2-6da47d0cf407",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "settings = SettingsCreator(\n",
+    "    link_type=\"link_only\",\n",
+    "    unique_id_column_name=\"record_id\",\n",
+    "    comparisons=[\n",
+    "        company_name_comparison,\n",
+    "        address_comparison,\n",
+    "        zip_code_comparison,\n",
+    "        state_comparison,\n",
+    "        city_comparison\n",
+    "    ],\n",
+    "    blocking_rules_to_generate_predictions=[\n",
+    "        br0, br1\n",
+    "    ],\n",
+    "    retain_intermediate_calculation_columns=True,\n",
+    ")\n",
+    "\n",
+    "linker = Linker([sec_match_df, eia_match_df], settings, db_api=DuckDBAPI())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 384,
+   "id": "36cae876-783d-4bff-89df-9d30cc5e60d6",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "358d0a088e2441deaef798c55ad97068",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Probability two random records match is estimated to be  2.18e-05.\n",
+      "This means that amongst all possible pairwise record comparisons, one in 45,828.17 are expected to match.  With 40,620,617,120 total possible comparisons, we expect a total of around 886,367.78 matching pairs\n"
+     ]
+    }
+   ],
+   "source": [
+    "deterministic_rules = [\n",
+    "    block_on(\"company_name\", \"company_name\"),\n",
+    "    block_on(\"phone_number\"),\n",
+    "    block_on(\"street_address\"),\n",
+    "    \"jaccard(r.company_name, l.company_name) >= .9 and l.city = r.city\",\n",
+    "    \"substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4) and l.city = r.city\",\n",
+    "]\n",
+    "\n",
+    "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.9)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 385,
+   "id": "a5190bda-070d-4d8e-a6db-be7c65b04db3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "----- Estimating u probabilities using random sampling -----\n",
+      "\n",
+      "Estimated u probabilities using random sampling\n",
+      "\n",
+      "Your model is not yet fully trained. Missing estimates for:\n",
+      "    - company_name (no m values are trained).\n",
+      "    - street_address (no m values are trained).\n",
+      "    - zip_code (no m values are trained).\n",
+      "    - state (no m values are trained).\n",
+      "    - city (no m values are trained).\n"
+     ]
+    }
+   ],
+   "source": [
+    "linker.training.estimate_u_using_random_sampling(max_pairs=1e7)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 386,
+   "id": "f8dd1336-8a5b-49d2-a054-df0c0a0e953f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "----- Starting EM training session -----\n",
+      "\n",
+      "Estimating the m probabilities of the model by blocking on:\n",
+      "(l.\"company_name\" = r.\"company_name\") AND (l.\"company_name\" = r.\"company_name\")\n",
+      "\n",
+      "Parameter estimates will be made for the following comparison(s):\n",
+      "    - street_address\n",
+      "    - zip_code\n",
+      "    - state\n",
+      "    - city\n",
+      "\n",
+      "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
+      "    - company_name\n",
+      "\n",
+      "Iteration 1: Largest change in params was 0.804 in the m_probability of street_address, level `All other comparisons`\n",
+      "Iteration 2: Largest change in params was 0.0737 in the m_probability of state, level `Exact match on state`\n",
+      "Iteration 3: Largest change in params was -0.039 in the m_probability of state, level `All other comparisons`\n",
+      "Iteration 4: Largest change in params was 0.021 in the m_probability of city, level `All other comparisons`\n",
+      "Iteration 5: Largest change in params was 0.00805 in the m_probability of city, level `All other comparisons`\n",
+      "Iteration 6: Largest change in params was -0.00338 in the m_probability of state, level `All other comparisons`\n",
+      "Iteration 7: Largest change in params was 0.00164 in the m_probability of state, level `Exact match on state`\n",
+      "Iteration 8: Largest change in params was 0.000825 in the m_probability of state, level `Exact match on state`\n",
+      "Iteration 9: Largest change in params was -0.000425 in the m_probability of state, level `All other comparisons`\n",
+      "Iteration 10: Largest change in params was -0.000223 in the m_probability of state, level `All other comparisons`\n",
+      "Iteration 11: Largest change in params was 0.000118 in the m_probability of state, level `Exact match on state`\n",
+      "Iteration 12: Largest change in params was 6.29e-05 in the m_probability of state, level `Exact match on state`\n",
+      "\n",
+      "EM converged after 12 iterations\n",
+      "\n",
+      "Your model is not yet fully trained. Missing estimates for:\n",
+      "    - company_name (no m values are trained).\n"
+     ]
+    }
+   ],
+   "source": [
+    "training_blocking_rule = block_on(\"company_name\", \"company_name\")\n",
+    "training_session_fname_sname = (\n",
+    "    linker.training.estimate_parameters_using_expectation_maximisation(training_blocking_rule)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 387,
+   "id": "9581aa18-3352-429a-86c4-6078bcf13a55",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "----- Starting EM training session -----\n",
+      "\n",
+      "Estimating the m probabilities of the model by blocking on:\n",
+      "(l.\"street_address\" = r.\"street_address\") AND (l.\"street_address\" = r.\"street_address\")\n",
+      "\n",
+      "Parameter estimates will be made for the following comparison(s):\n",
+      "    - company_name\n",
+      "    - zip_code\n",
+      "    - state\n",
+      "    - city\n",
+      "\n",
+      "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
+      "    - street_address\n",
+      "\n",
+      "Iteration 1: Largest change in params was -0.929 in the m_probability of company_name, level `Exact match on company_name`\n",
+      "Iteration 2: Largest change in params was 0.0355 in probability_two_random_records_match\n",
+      "Iteration 3: Largest change in params was 0.00843 in the m_probability of state, level `All other comparisons`\n",
+      "Iteration 4: Largest change in params was -0.00612 in the m_probability of state, level `Exact match on state`\n",
+      "Iteration 5: Largest change in params was -0.00431 in the m_probability of state, level `Exact match on state`\n",
+      "Iteration 6: Largest change in params was -0.00301 in the m_probability of state, level `Exact match on state`\n",
+      "Iteration 7: Largest change in params was 0.0021 in the m_probability of state, level `All other comparisons`\n",
+      "Iteration 8: Largest change in params was -0.00146 in the m_probability of state, level `Exact match on state`\n",
+      "Iteration 9: Largest change in params was 0.00101 in the m_probability of state, level `All other comparisons`\n",
+      "Iteration 10: Largest change in params was -0.000704 in the m_probability of state, level `Exact match on state`\n",
+      "Iteration 11: Largest change in params was 0.000489 in the m_probability of state, level `All other comparisons`\n",
+      "Iteration 12: Largest change in params was -0.00034 in the m_probability of state, level `Exact match on state`\n",
+      "Iteration 13: Largest change in params was -0.000236 in the m_probability of state, level `Exact match on state`\n",
+      "Iteration 14: Largest change in params was 0.000164 in the m_probability of state, level `All other comparisons`\n",
+      "Iteration 15: Largest change in params was -0.000114 in the m_probability of state, level `Exact match on state`\n",
+      "Iteration 16: Largest change in params was -7.88e-05 in the m_probability of state, level `Exact match on state`\n",
+      "\n",
+      "EM converged after 16 iterations\n",
+      "\n",
+      "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n"
+     ]
+    }
+   ],
+   "source": [
+    "training_blocking_rule = block_on(\"street_address\", \"street_address\")\n",
+    "training_session_fname_sname = (\n",
+    "    linker.training.estimate_parameters_using_expectation_maximisation(training_blocking_rule)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 388,
+   "id": "8ad317ed-1db9-4932-9815-6e9e0efa9580",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "<style>\n",
+       "  #altair-viz-683c397c94694591a2af8e121ffd957d.vega-embed {\n",
+       "    width: 100%;\n",
+       "    display: flex;\n",
+       "  }\n",
+       "\n",
+       "  #altair-viz-683c397c94694591a2af8e121ffd957d.vega-embed details,\n",
+       "  #altair-viz-683c397c94694591a2af8e121ffd957d.vega-embed details summary {\n",
+       "    position: relative;\n",
+       "  }\n",
+       "</style>\n",
+       "<div id=\"altair-viz-683c397c94694591a2af8e121ffd957d\"></div>\n",
+       "<script type=\"text/javascript\">\n",
+       "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
+       "  (function(spec, embedOpt){\n",
+       "    let outputDiv = document.currentScript.previousElementSibling;\n",
+       "    if (outputDiv.id !== \"altair-viz-683c397c94694591a2af8e121ffd957d\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-683c397c94694591a2af8e121ffd957d\");\n",
+       "    }\n",
+       "    const paths = {\n",
+       "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
+       "      \"vega-lib\": \"https://cdn.jsdelivr.net/npm/vega-lib?noext\",\n",
+       "      \"vega-lite\": \"https://cdn.jsdelivr.net/npm/vega-lite@5.20.1?noext\",\n",
+       "      \"vega-embed\": \"https://cdn.jsdelivr.net/npm/vega-embed@6?noext\",\n",
+       "    };\n",
+       "\n",
+       "    function maybeLoadScript(lib, version) {\n",
+       "      var key = `${lib.replace(\"-\", \"\")}_version`;\n",
+       "      return (VEGA_DEBUG[key] == version) ?\n",
+       "        Promise.resolve(paths[lib]) :\n",
+       "        new Promise(function(resolve, reject) {\n",
+       "          var s = document.createElement('script');\n",
+       "          document.getElementsByTagName(\"head\")[0].appendChild(s);\n",
+       "          s.async = true;\n",
+       "          s.onload = () => {\n",
+       "            VEGA_DEBUG[key] = version;\n",
+       "            return resolve(paths[lib]);\n",
+       "          };\n",
+       "          s.onerror = () => reject(`Error loading script: ${paths[lib]}`);\n",
+       "          s.src = paths[lib];\n",
+       "        });\n",
+       "    }\n",
+       "\n",
+       "    function showError(err) {\n",
+       "      outputDiv.innerHTML = `<div class=\"error\" style=\"color:red;\">${err}</div>`;\n",
+       "      throw err;\n",
+       "    }\n",
+       "\n",
+       "    function displayChart(vegaEmbed) {\n",
+       "      vegaEmbed(outputDiv, spec, embedOpt)\n",
+       "        .catch(err => showError(`Javascript Error: ${err.message}<br>This usually means there's a typo in your chart specification. See the javascript console for the full traceback.`));\n",
+       "    }\n",
+       "\n",
+       "    if(typeof define === \"function\" && define.amd) {\n",
+       "      requirejs.config({paths});\n",
+       "      require([\"vega-embed\"], displayChart, err => showError(`Error loading script: ${err.message}`));\n",
+       "    } else {\n",
+       "      maybeLoadScript(\"vega\", \"5\")\n",
+       "        .then(() => maybeLoadScript(\"vega-lite\", \"5.20.1\"))\n",
+       "        .then(() => maybeLoadScript(\"vega-embed\", \"6\"))\n",
+       "        .catch(showError)\n",
+       "        .then(() => displayChart(vegaEmbed));\n",
+       "    }\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 300, \"continuousHeight\": 300, \"discreteHeight\": 60, \"discreteWidth\": 400}, \"header\": {\"title\": null}, \"mark\": {\"tooltip\": null}, \"title\": {\"anchor\": \"middle\"}}, \"vconcat\": [{\"mark\": {\"type\": \"bar\", \"clip\": true, \"height\": 15}, \"encoding\": {\"color\": {\"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-10, 0, 10], \"interpolate\": \"lab\", \"range\": [\"red\", \"#bbbbbb\", \"green\"]}, \"title\": \"Match weight\", \"type\": \"quantitative\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"probability_two_random_records_match\", \"format\": \".4f\", \"title\": \"Probability two random records match\", \"type\": \"nominal\"}, {\"field\": \"log2_bayes_factor\", \"format\": \",.4f\", \"title\": \"Equivalent match weight\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"domain\": false, \"gridColor\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": \"#aaa\"}, \"value\": \"#ddd\"}, \"gridDash\": {\"condition\": {\"test\": \"abs(datum.value / 10) == 1\", \"value\": [3]}, \"value\": null}, \"gridWidth\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": 2}, \"value\": 1}, \"labels\": false, \"ticks\": false, \"title\": \"\"}, \"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-16, 16]}, \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": \"Prior (starting) match weight\", \"titleAlign\": \"right\", \"titleAngle\": 0, \"titleFontWeight\": \"normal\"}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": 20, \"transform\": [{\"filter\": \"(datum.comparison_name == 
'probability_two_random_records_match')\"}]}, {\"mark\": {\"type\": \"bar\", \"clip\": true}, \"encoding\": {\"color\": {\"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-10, 0, 10], \"interpolate\": \"lab\", \"range\": [\"red\", \"#bbbbbb\", \"green\"]}, \"title\": \"Match weight\", \"type\": \"quantitative\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labelAlign\": \"left\", \"labelAnchor\": \"middle\", \"labelAngle\": 0}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"gridColor\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": \"#aaa\"}, \"value\": \"#ddd\"}, \"gridDash\": {\"condition\": {\"test\": \"abs(datum.value / 10) == 1\", \"value\": [3]}, \"value\": null}, \"gridWidth\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": 2}, \"value\": 1}, 
\"title\": \"Comparison level match weight = log2(m/u)\"}, \"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-16, 16]}, \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"axis\": {\"y\": \"independent\"}, \"scale\": {\"y\": \"independent\"}}, \"transform\": [{\"filter\": \"(datum.comparison_name != 'probability_two_random_records_match')\"}]}], \"data\": {\"name\": \"data-f89554aea166da6e147a98b6901fa5cf\"}, \"params\": [{\"name\": \"mouse_zoom\", \"select\": {\"type\": \"interval\", \"encodings\": [\"x\"]}, \"bind\": \"scales\", \"views\": []}], \"resolve\": {\"axis\": {\"y\": \"independent\"}, \"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Model parameters (components of final match weight)\", \"subtitle\": \"Use mousewheel to zoom\"}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-f89554aea166da6e147a98b6901fa5cf\": [{\"comparison_name\": \"probability_two_random_records_match\", \"sql_condition\": null, \"label_for_charts\": \"\", \"m_probability\": null, \"u_probability\": null, \"m_probability_description\": null, \"u_probability_description\": null, \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": null, \"is_null_level\": false, \"bayes_factor\": 2.1821114058696376e-05, \"log2_bayes_factor\": -15.483915715308404, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 0, \"bayes_factor_description\": \"The probability that two random records drawn at random match is 0.000 or one in  45,828.2 records.This is equivalent to a starting match weight of -15.484.\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": -1}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"\\\"company_name_l\\\" = 
\\\"company_name_r\\\"\", \"label_for_charts\": \"Exact match on company_name\", \"m_probability\": 0.019774989606985885, \"u_probability\": 2.027918645697988e-06, \"m_probability_description\": \"Amongst matching record comparisons, 1.977% of records (i.e. one in 50.57) are in the exact match on company_name comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.0002028% of records (i.e. one in 493,116) are in the exact match on company_name comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 9751.372250033997, \"log2_bayes_factor\": 13.251389539320668, \"comparison_vector_value\": 3, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `exact match on company_name` then comparison is 9,751 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"jaccard(\\\"company_name_l\\\", \\\"company_name_r\\\") >= 0.9\", \"label_for_charts\": \"Jaccard distance of 'company_name >= 0.9'\", \"m_probability\": 0.01846534387207393, \"u_probability\": 0.0019522096829252629, \"m_probability_description\": \"Amongst matching record comparisons, 1.847% of records (i.e. one in 54.16) are in the jaccard distance of 'company_name >= 0.9' comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.1952% of records (i.e. 
one in 512) are in the jaccard distance of 'company_name >= 0.9' comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 9.458688804577989, \"log2_bayes_factor\": 3.241640206160788, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `jaccard distance of 'company_name >= 0.9'` then comparison is 9.459 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"jaccard(\\\"company_name_l\\\", \\\"company_name_r\\\") >= 0.7\", \"label_for_charts\": \"Jaccard distance of 'company_name >= 0.7'\", \"m_probability\": 0.2788273861192321, \"u_probability\": 0.19060658426557237, \"m_probability_description\": \"Amongst matching record comparisons, 27.88% of records (i.e. one in 3.586) are in the jaccard distance of 'company_name >= 0.7' comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 19.06% of records (i.e. 
one in 5.246) are in the jaccard distance of 'company_name >= 0.7' comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 1.462842362941364, \"log2_bayes_factor\": 0.5487743118852083, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `jaccard distance of 'company_name >= 0.7'` then comparison is 1.463 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.6829322804017081, \"u_probability\": 0.8074391781328567, \"m_probability_description\": \"Amongst matching record comparisons, 68.29% of records (i.e. one in 1.464) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 80.74% of records (i.e. one in 1.238) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.8458002768467817, \"log2_bayes_factor\": -0.2416110622985932, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.182 times less likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 0}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"\\\"street_address_l\\\" = \\\"street_address_r\\\"\", \"label_for_charts\": \"Exact match on street_address\", \"m_probability\": 0.13355154452924042, \"u_probability\": 5.3722899239283746e-06, \"m_probability_description\": \"Amongst matching record comparisons, 13.36% of records (i.e. 
one in 7.488) are in the exact match on street_address comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.0005372% of records (i.e. one in 186,140) are in the exact match on street_address comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 24859.333062870825, \"log2_bayes_factor\": 14.601499971201484, \"comparison_vector_value\": 3, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `exact match on street_address` then comparison is 24,859 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"levenshtein(\\\"street_address_l\\\", \\\"street_address_r\\\") <= 1\", \"label_for_charts\": \"Levenshtein distance of street_address <= 1\", \"m_probability\": 0.012195893520133996, \"u_probability\": 1.4814496456893396e-05, \"m_probability_description\": \"Amongst matching record comparisons, 1.22% of records (i.e. one in 81.99) are in the levenshtein distance of street_address <= 1 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.001481% of records (i.e. 
one in 67,501) are in the levenshtein distance of street_address <= 1 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 823.2405033556894, \"log2_bayes_factor\": 9.685170154213719, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `levenshtein distance of street_address <= 1` then comparison is 823 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"levenshtein(\\\"street_address_l\\\", \\\"street_address_r\\\") <= 2\", \"label_for_charts\": \"Levenshtein distance of street_address <= 2\", \"m_probability\": 0.01029028015491091, \"u_probability\": 0.00012974894149608832, \"m_probability_description\": \"Amongst matching record comparisons, 1.029% of records (i.e. one in 97.18) are in the levenshtein distance of street_address <= 2 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.01297% of records (i.e. 
one in 7,707) are in the levenshtein distance of street_address <= 2 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 79.30916457781771, \"log2_bayes_factor\": 6.309415681207828, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `levenshtein distance of street_address <= 2` then comparison is 79.31 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.8439622817957148, \"u_probability\": 0.9998500642721231, \"m_probability_description\": \"Amongst matching record comparisons, 84.4% of records (i.e. one in 1.185) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 99.99% of records (i.e. one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.8440888408704634, \"log2_bayes_factor\": -0.2445332434217164, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.185 times less likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 1}, {\"comparison_name\": \"zip_code\", \"sql_condition\": \"\\\"zip_code_l\\\" = \\\"zip_code_r\\\"\", \"label_for_charts\": \"Exact match on zip_code\", \"m_probability\": 0.5972484099495287, \"u_probability\": 0.0004124676922519243, \"m_probability_description\": \"Amongst matching record comparisons, 59.72% of records (i.e. 
one in 1.674) are in the exact match on zip_code comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.04125% of records (i.e. one in 2,424) are in the exact match on zip_code comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"zip_code\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 1447.9883422838975, \"log2_bayes_factor\": 10.499834272030089, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on zip_code` then comparison is 1,448 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 2}, {\"comparison_name\": \"zip_code\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.40275159005047145, \"u_probability\": 0.9995875323077481, \"m_probability_description\": \"Amongst matching record comparisons, 40.28% of records (i.e. one in 2.483) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 99.96% of records (i.e. 
one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.4029177806176101, \"log2_bayes_factor\": -1.3114426223796678, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 2.482 times less likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 2}, {\"comparison_name\": \"state\", \"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Exact match on state\", \"m_probability\": 0.9589078004241631, \"u_probability\": 0.04243320317142917, \"m_probability_description\": \"Amongst matching record comparisons, 95.89% of records (i.e. one in 1.043) are in the exact match on state comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 4.243% of records (i.e. one in 23.57) are in the exact match on state comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"state\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 22.598053617357085, \"log2_bayes_factor\": 4.498126612770657, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on state` then comparison is 22.6 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 3}, {\"comparison_name\": \"state\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.04109219957583696, \"u_probability\": 0.9575667968285708, \"m_probability_description\": \"Amongst matching record comparisons, 4.109% of records (i.e. 
one in 24.34) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 95.76% of records (i.e. one in 1.044) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.04291314163349538, \"log2_bayes_factor\": -4.542436666382371, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 23.3 times less likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 3}, {\"comparison_name\": \"city\", \"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Exact match on city\", \"m_probability\": 0.8925172913420765, \"u_probability\": 0.00414046488931094, \"m_probability_description\": \"Amongst matching record comparisons, 89.25% of records (i.e. one in 1.12) are in the exact match on city comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.414% of records (i.e. 
one in 242) are in the exact match on city comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"city\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 215.55968114744962, \"log2_bayes_factor\": 7.751943547604383, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on city` then comparison is 216 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 4}, {\"comparison_name\": \"city\", \"sql_condition\": \"jaro_winkler_similarity(\\\"city_l\\\", \\\"city_r\\\") >= 0.9\", \"label_for_charts\": \"Jaro-Winkler distance of city >= 0.9\", \"m_probability\": 0.02417396754770653, \"u_probability\": 0.00030828203643208597, \"m_probability_description\": \"Amongst matching record comparisons, 2.417% of records (i.e. one in 41.37) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.03083% of records (i.e. one in 3,244) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 78.41510270103596, \"log2_bayes_factor\": 6.2930596381757935, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of city >= 0.9` then comparison is 78.42 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 4}, {\"comparison_name\": \"city\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.08330874111021702, \"u_probability\": 0.995551253074257, \"m_probability_description\": \"Amongst matching record comparisons, 8.331% of records (i.e. 
one in 12) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 99.56% of records (i.e. one in 1.004) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.08368101677634383, \"log2_bayes_factor\": -3.578955808442296, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 11.95 times less likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 4}]}}, {\"mode\": \"vega-lite\"});\n",
+       "</script>"
+      ],
+      "text/plain": [
+       "alt.VConcatChart(...)"
+      ]
+     },
+     "execution_count": 388,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "linker.visualisations.match_weights_chart()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 389,
+   "id": "5e21bf55-64ac-4f4b-8f1c-d7507b5e7af6",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "<style>\n",
+       "  #altair-viz-7b23fa91c63f4c55b1f9b687acbced53.vega-embed {\n",
+       "    width: 100%;\n",
+       "    display: flex;\n",
+       "  }\n",
+       "\n",
+       "  #altair-viz-7b23fa91c63f4c55b1f9b687acbced53.vega-embed details,\n",
+       "  #altair-viz-7b23fa91c63f4c55b1f9b687acbced53.vega-embed details summary {\n",
+       "    position: relative;\n",
+       "  }\n",
+       "</style>\n",
+       "<div id=\"altair-viz-7b23fa91c63f4c55b1f9b687acbced53\"></div>\n",
+       "<script type=\"text/javascript\">\n",
+       "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
+       "  (function(spec, embedOpt){\n",
+       "    let outputDiv = document.currentScript.previousElementSibling;\n",
+       "    if (outputDiv.id !== \"altair-viz-7b23fa91c63f4c55b1f9b687acbced53\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-7b23fa91c63f4c55b1f9b687acbced53\");\n",
+       "    }\n",
+       "    const paths = {\n",
+       "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
+       "      \"vega-lib\": \"https://cdn.jsdelivr.net/npm/vega-lib?noext\",\n",
+       "      \"vega-lite\": \"https://cdn.jsdelivr.net/npm/vega-lite@5.20.1?noext\",\n",
+       "      \"vega-embed\": \"https://cdn.jsdelivr.net/npm/vega-embed@6?noext\",\n",
+       "    };\n",
+       "\n",
+       "    function maybeLoadScript(lib, version) {\n",
+       "      var key = `${lib.replace(\"-\", \"\")}_version`;\n",
+       "      return (VEGA_DEBUG[key] == version) ?\n",
+       "        Promise.resolve(paths[lib]) :\n",
+       "        new Promise(function(resolve, reject) {\n",
+       "          var s = document.createElement('script');\n",
+       "          document.getElementsByTagName(\"head\")[0].appendChild(s);\n",
+       "          s.async = true;\n",
+       "          s.onload = () => {\n",
+       "            VEGA_DEBUG[key] = version;\n",
+       "            return resolve(paths[lib]);\n",
+       "          };\n",
+       "          s.onerror = () => reject(`Error loading script: ${paths[lib]}`);\n",
+       "          s.src = paths[lib];\n",
+       "        });\n",
+       "    }\n",
+       "\n",
+       "    function showError(err) {\n",
+       "      outputDiv.innerHTML = `<div class=\"error\" style=\"color:red;\">${err}</div>`;\n",
+       "      throw err;\n",
+       "    }\n",
+       "\n",
+       "    function displayChart(vegaEmbed) {\n",
+       "      vegaEmbed(outputDiv, spec, embedOpt)\n",
+       "        .catch(err => showError(`Javascript Error: ${err.message}<br>This usually means there's a typo in your chart specification. See the javascript console for the full traceback.`));\n",
+       "    }\n",
+       "\n",
+       "    if(typeof define === \"function\" && define.amd) {\n",
+       "      requirejs.config({paths});\n",
+       "      require([\"vega-embed\"], displayChart, err => showError(`Error loading script: ${err.message}`));\n",
+       "    } else {\n",
+       "      maybeLoadScript(\"vega\", \"5\")\n",
+       "        .then(() => maybeLoadScript(\"vega-lite\", \"5.20.1\"))\n",
+       "        .then(() => maybeLoadScript(\"vega-embed\", \"6\"))\n",
+       "        .catch(showError)\n",
+       "        .then(() => displayChart(vegaEmbed));\n",
+       "    }\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 300, \"continuousHeight\": 300, \"discreteHeight\": 300, \"discreteWidth\": 400}, \"header\": {\"title\": null}, \"title\": {\"anchor\": \"middle\", \"offset\": 10}}, \"hconcat\": [{\"mark\": \"bar\", \"encoding\": {\"color\": {\"value\": \"green\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labelAlign\": \"left\", \"labelAnchor\": \"middle\", \"labelAngle\": 0}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"title\": \"Proportion of record comparisons\"}, \"field\": \"m_probability\", \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Amongst 
matching record comparisons:\", \"fontSize\": 12, \"fontWeight\": \"bold\"}, \"transform\": [{\"filter\": \"(datum.bayes_factor != 'no-op filter due to vega lite issue 4680')\"}], \"width\": 150}, {\"mark\": \"bar\", \"encoding\": {\"color\": {\"value\": \"red\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labels\": false}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"title\": \"Proportion of record comparisons\"}, \"field\": \"u_probability\", \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Amongst non-matching record comparisons:\", \"fontSize\": 12, \"fontWeight\": \"bold\"}, \"transform\": 
[{\"filter\": \"(datum.bayes_factor != 'no-op filter2 due to vega lite issue 4680')\"}], \"width\": 150}], \"data\": {\"name\": \"data-3507ca9ecb389fb002b9f229324388dd\"}, \"title\": {\"text\": \"Proportion of record comparisons in each comparison level by match status\", \"subtitle\": \"(m and u probabilities)\"}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-3507ca9ecb389fb002b9f229324388dd\": [{\"comparison_name\": \"company_name\", \"sql_condition\": \"\\\"company_name_l\\\" = \\\"company_name_r\\\"\", \"label_for_charts\": \"Exact match on company_name\", \"m_probability\": 0.019774989606985885, \"u_probability\": 2.027918645697988e-06, \"m_probability_description\": \"Amongst matching record comparisons, 1.977% of records (i.e. one in 50.57) are in the exact match on company_name comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.0002028% of records (i.e. one in 493,116) are in the exact match on company_name comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 9751.372250033997, \"log2_bayes_factor\": 13.251389539320668, \"comparison_vector_value\": 3, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `exact match on company_name` then comparison is 9,751 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"jaccard(\\\"company_name_l\\\", \\\"company_name_r\\\") >= 0.9\", \"label_for_charts\": \"Jaccard distance of 'company_name >= 0.9'\", \"m_probability\": 0.01846534387207393, \"u_probability\": 0.0019522096829252629, \"m_probability_description\": \"Amongst matching record comparisons, 1.847% of records (i.e. 
one in 54.16) are in the jaccard distance of 'company_name >= 0.9' comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.1952% of records (i.e. one in 512) are in the jaccard distance of 'company_name >= 0.9' comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 9.458688804577989, \"log2_bayes_factor\": 3.241640206160788, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `jaccard distance of 'company_name >= 0.9'` then comparison is 9.459 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"jaccard(\\\"company_name_l\\\", \\\"company_name_r\\\") >= 0.7\", \"label_for_charts\": \"Jaccard distance of 'company_name >= 0.7'\", \"m_probability\": 0.2788273861192321, \"u_probability\": 0.19060658426557237, \"m_probability_description\": \"Amongst matching record comparisons, 27.88% of records (i.e. one in 3.586) are in the jaccard distance of 'company_name >= 0.7' comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 19.06% of records (i.e. 
one in 5.246) are in the jaccard distance of 'company_name >= 0.7' comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 1.462842362941364, \"log2_bayes_factor\": 0.5487743118852083, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `jaccard distance of 'company_name >= 0.7'` then comparison is 1.463 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.6829322804017081, \"u_probability\": 0.8074391781328567, \"m_probability_description\": \"Amongst matching record comparisons, 68.29% of records (i.e. one in 1.464) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 80.74% of records (i.e. one in 1.238) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.8458002768467817, \"log2_bayes_factor\": -0.2416110622985932, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.182 times less likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 0}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"\\\"street_address_l\\\" = \\\"street_address_r\\\"\", \"label_for_charts\": \"Exact match on street_address\", \"m_probability\": 0.13355154452924042, \"u_probability\": 5.3722899239283746e-06, \"m_probability_description\": \"Amongst matching record comparisons, 13.36% of records (i.e. 
one in 7.488) are in the exact match on street_address comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.0005372% of records (i.e. one in 186,140) are in the exact match on street_address comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 24859.333062870825, \"log2_bayes_factor\": 14.601499971201484, \"comparison_vector_value\": 3, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `exact match on street_address` then comparison is 24,859 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"levenshtein(\\\"street_address_l\\\", \\\"street_address_r\\\") <= 1\", \"label_for_charts\": \"Levenshtein distance of street_address <= 1\", \"m_probability\": 0.012195893520133996, \"u_probability\": 1.4814496456893396e-05, \"m_probability_description\": \"Amongst matching record comparisons, 1.22% of records (i.e. one in 81.99) are in the levenshtein distance of street_address <= 1 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.001481% of records (i.e. 
one in 67,501) are in the levenshtein distance of street_address <= 1 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 823.2405033556894, \"log2_bayes_factor\": 9.685170154213719, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `levenshtein distance of street_address <= 1` then comparison is 823 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"levenshtein(\\\"street_address_l\\\", \\\"street_address_r\\\") <= 2\", \"label_for_charts\": \"Levenshtein distance of street_address <= 2\", \"m_probability\": 0.01029028015491091, \"u_probability\": 0.00012974894149608832, \"m_probability_description\": \"Amongst matching record comparisons, 1.029% of records (i.e. one in 97.18) are in the levenshtein distance of street_address <= 2 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.01297% of records (i.e. 
one in 7,707) are in the levenshtein distance of street_address <= 2 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 79.30916457781771, \"log2_bayes_factor\": 6.309415681207828, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `levenshtein distance of street_address <= 2` then comparison is 79.31 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.8439622817957148, \"u_probability\": 0.9998500642721231, \"m_probability_description\": \"Amongst matching record comparisons, 84.4% of records (i.e. one in 1.185) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 99.99% of records (i.e. one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.8440888408704634, \"log2_bayes_factor\": -0.2445332434217164, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.185 times less likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 1}, {\"comparison_name\": \"zip_code\", \"sql_condition\": \"\\\"zip_code_l\\\" = \\\"zip_code_r\\\"\", \"label_for_charts\": \"Exact match on zip_code\", \"m_probability\": 0.5972484099495287, \"u_probability\": 0.0004124676922519243, \"m_probability_description\": \"Amongst matching record comparisons, 59.72% of records (i.e. 
one in 1.674) are in the exact match on zip_code comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.04125% of records (i.e. one in 2,424) are in the exact match on zip_code comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"zip_code\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 1447.9883422838975, \"log2_bayes_factor\": 10.499834272030089, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on zip_code` then comparison is 1,448 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 2}, {\"comparison_name\": \"zip_code\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.40275159005047145, \"u_probability\": 0.9995875323077481, \"m_probability_description\": \"Amongst matching record comparisons, 40.28% of records (i.e. one in 2.483) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 99.96% of records (i.e. 
one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.4029177806176101, \"log2_bayes_factor\": -1.3114426223796678, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 2.482 times less likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 2}, {\"comparison_name\": \"state\", \"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Exact match on state\", \"m_probability\": 0.9589078004241631, \"u_probability\": 0.04243320317142917, \"m_probability_description\": \"Amongst matching record comparisons, 95.89% of records (i.e. one in 1.043) are in the exact match on state comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 4.243% of records (i.e. one in 23.57) are in the exact match on state comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"state\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 22.598053617357085, \"log2_bayes_factor\": 4.498126612770657, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on state` then comparison is 22.6 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 3}, {\"comparison_name\": \"state\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.04109219957583696, \"u_probability\": 0.9575667968285708, \"m_probability_description\": \"Amongst matching record comparisons, 4.109% of records (i.e. 
one in 24.34) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 95.76% of records (i.e. one in 1.044) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.04291314163349538, \"log2_bayes_factor\": -4.542436666382371, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 23.3 times less likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 3}, {\"comparison_name\": \"city\", \"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Exact match on city\", \"m_probability\": 0.8925172913420765, \"u_probability\": 0.00414046488931094, \"m_probability_description\": \"Amongst matching record comparisons, 89.25% of records (i.e. one in 1.12) are in the exact match on city comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.414% of records (i.e. 
one in 242) are in the exact match on city comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"city\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 215.55968114744962, \"log2_bayes_factor\": 7.751943547604383, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on city` then comparison is 216 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 4}, {\"comparison_name\": \"city\", \"sql_condition\": \"jaro_winkler_similarity(\\\"city_l\\\", \\\"city_r\\\") >= 0.9\", \"label_for_charts\": \"Jaro-Winkler distance of city >= 0.9\", \"m_probability\": 0.02417396754770653, \"u_probability\": 0.00030828203643208597, \"m_probability_description\": \"Amongst matching record comparisons, 2.417% of records (i.e. one in 41.37) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.03083% of records (i.e. one in 3,244) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 78.41510270103596, \"log2_bayes_factor\": 6.2930596381757935, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of city >= 0.9` then comparison is 78.42 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 4}, {\"comparison_name\": \"city\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.08330874111021702, \"u_probability\": 0.995551253074257, \"m_probability_description\": \"Amongst matching record comparisons, 8.331% of records (i.e. 
one in 12) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 99.56% of records (i.e. one in 1.004) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.08368101677634383, \"log2_bayes_factor\": -3.578955808442296, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 11.95 times less likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 4}]}}, {\"mode\": \"vega-lite\"});\n",
+       "</script>"
+      ],
+      "text/plain": [
+       "alt.HConcatChart(...)"
+      ]
+     },
+     "execution_count": 389,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# company_name doesn't look good here\n",
+    "linker.visualisations.m_u_parameters_chart()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 285,
+   "id": "fedb78e1-ee73-4d1e-8a96-3b27f6561a91",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "settings = linker.misc.save_model_to_json(\n",
+    "    \"model_test.json\", overwrite=True\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "31f9d73d-cfa4-41fa-906f-c8501a29283b",
+   "metadata": {},
+   "source": [
+    "## Make Predictions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 390,
+   "id": "94e96441-89b6-4516-aa6a-4d1593ce03be",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Blocking time: 0.28 seconds\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "1680da9f410c424d8e5648fc98c88022",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Predict time: 3.06 seconds\n"
+     ]
+    }
+   ],
+   "source": [
+    "df_predictions = linker.inference.predict(threshold_match_probability=0.5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 391,
+   "id": "722effbc-24e8-45ca-aa64-8cdcd75846e0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "preds_df = df_predictions.as_pandas_dataframe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 392,
+   "id": "21b19a11-15c6-46ab-bd73-7c4752f7e25e",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>match_weight</th>\n",
+       "      <th>match_probability</th>\n",
+       "      <th>source_dataset_l</th>\n",
+       "      <th>source_dataset_r</th>\n",
+       "      <th>record_id_l</th>\n",
+       "      <th>record_id_r</th>\n",
+       "      <th>company_name_l</th>\n",
+       "      <th>company_name_r</th>\n",
+       "      <th>gamma_company_name</th>\n",
+       "      <th>bf_company_name</th>\n",
+       "      <th>street_address_l</th>\n",
+       "      <th>street_address_r</th>\n",
+       "      <th>gamma_street_address</th>\n",
+       "      <th>bf_street_address</th>\n",
+       "      <th>zip_code_l</th>\n",
+       "      <th>zip_code_r</th>\n",
+       "      <th>gamma_zip_code</th>\n",
+       "      <th>tf_zip_code_l</th>\n",
+       "      <th>tf_zip_code_r</th>\n",
+       "      <th>bf_zip_code</th>\n",
+       "      <th>bf_tf_adj_zip_code</th>\n",
+       "      <th>state_l</th>\n",
+       "      <th>state_r</th>\n",
+       "      <th>gamma_state</th>\n",
+       "      <th>tf_state_l</th>\n",
+       "      <th>tf_state_r</th>\n",
+       "      <th>bf_state</th>\n",
+       "      <th>bf_tf_adj_state</th>\n",
+       "      <th>city_l</th>\n",
+       "      <th>city_r</th>\n",
+       "      <th>gamma_city</th>\n",
+       "      <th>tf_city_l</th>\n",
+       "      <th>tf_city_r</th>\n",
+       "      <th>bf_city</th>\n",
+       "      <th>bf_tf_adj_city</th>\n",
+       "      <th>company_name_mphone_l</th>\n",
+       "      <th>company_name_mphone_r</th>\n",
+       "      <th>report_year_l</th>\n",
+       "      <th>report_year_r</th>\n",
+       "      <th>match_key</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>11211</th>\n",
+       "      <td>0.054332</td>\n",
+       "      <td>0.509414</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>85762</td>\n",
+       "      <td>68295</td>\n",
+       "      <td>citi trends incorporated</td>\n",
+       "      <td>georgia pacific corporation</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1.462842</td>\n",
+       "      <td>104 coleman boulevard</td>\n",
+       "      <td>None</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>31408</td>\n",
+       "      <td>31326</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000045</td>\n",
+       "      <td>0.000103</td>\n",
+       "      <td>0.402918</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>ga</td>\n",
+       "      <td>ga</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.023374</td>\n",
+       "      <td>0.023374</td>\n",
+       "      <td>22.598054</td>\n",
+       "      <td>1.815434</td>\n",
+       "      <td>savannah</td>\n",
+       "      <td>savannah</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000454</td>\n",
+       "      <td>0.000454</td>\n",
+       "      <td>215.559681</td>\n",
+       "      <td>9.129471</td>\n",
+       "      <td>ST TRNTS INKRPRTT</td>\n",
+       "      <td>JRJ PSFK KRPRXN</td>\n",
+       "      <td>2021</td>\n",
+       "      <td>2008</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11666</th>\n",
+       "      <td>0.098035</td>\n",
+       "      <td>0.516982</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>94615</td>\n",
+       "      <td>75114</td>\n",
+       "      <td>chicopee bancorp, incorporated</td>\n",
+       "      <td>chicopee city of</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.845800</td>\n",
+       "      <td>70 center street</td>\n",
+       "      <td>725 front street</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.844089</td>\n",
+       "      <td>01013</td>\n",
+       "      <td>01021</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000036</td>\n",
+       "      <td>0.000061</td>\n",
+       "      <td>0.402918</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>ma</td>\n",
+       "      <td>ma</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.042950</td>\n",
+       "      <td>0.042950</td>\n",
+       "      <td>22.598054</td>\n",
+       "      <td>0.987961</td>\n",
+       "      <td>chicopee</td>\n",
+       "      <td>chicopee</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000117</td>\n",
+       "      <td>0.000117</td>\n",
+       "      <td>215.559681</td>\n",
+       "      <td>35.431042</td>\n",
+       "      <td>XKP BNKRP INKRPRTT</td>\n",
+       "      <td>XKP ST OF</td>\n",
+       "      <td>2012</td>\n",
+       "      <td>2012</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11665</th>\n",
+       "      <td>0.098035</td>\n",
+       "      <td>0.516982</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>94614</td>\n",
+       "      <td>75115</td>\n",
+       "      <td>chicopee bancorp, incorporated</td>\n",
+       "      <td>chicopee city of</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.845800</td>\n",
+       "      <td>70 center street</td>\n",
+       "      <td>725 front street</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.844089</td>\n",
+       "      <td>01013</td>\n",
+       "      <td>01021</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000036</td>\n",
+       "      <td>0.000061</td>\n",
+       "      <td>0.402918</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>ma</td>\n",
+       "      <td>ma</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.042950</td>\n",
+       "      <td>0.042950</td>\n",
+       "      <td>22.598054</td>\n",
+       "      <td>0.987961</td>\n",
+       "      <td>chicopee</td>\n",
+       "      <td>chicopee</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000117</td>\n",
+       "      <td>0.000117</td>\n",
+       "      <td>215.559681</td>\n",
+       "      <td>35.431042</td>\n",
+       "      <td>XKP BNKRP INKRPRTT</td>\n",
+       "      <td>XKP ST OF</td>\n",
+       "      <td>2011</td>\n",
+       "      <td>2011</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11668</th>\n",
+       "      <td>0.098035</td>\n",
+       "      <td>0.516982</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>94618</td>\n",
+       "      <td>75118</td>\n",
+       "      <td>chicopee bancorp, incorporated</td>\n",
+       "      <td>chicopee city of</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.845800</td>\n",
+       "      <td>70 center street</td>\n",
+       "      <td>725 front street</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.844089</td>\n",
+       "      <td>01013</td>\n",
+       "      <td>01021</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000036</td>\n",
+       "      <td>0.000061</td>\n",
+       "      <td>0.402918</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>ma</td>\n",
+       "      <td>ma</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.042950</td>\n",
+       "      <td>0.042950</td>\n",
+       "      <td>22.598054</td>\n",
+       "      <td>0.987961</td>\n",
+       "      <td>chicopee</td>\n",
+       "      <td>chicopee</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000117</td>\n",
+       "      <td>0.000117</td>\n",
+       "      <td>215.559681</td>\n",
+       "      <td>35.431042</td>\n",
+       "      <td>XKP BNKRP INKRPRTT</td>\n",
+       "      <td>XKP ST OF</td>\n",
+       "      <td>2008</td>\n",
+       "      <td>2008</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11669</th>\n",
+       "      <td>0.098035</td>\n",
+       "      <td>0.516982</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>94620</td>\n",
+       "      <td>75116</td>\n",
+       "      <td>chicopee bancorp, incorporated</td>\n",
+       "      <td>chicopee city of</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.845800</td>\n",
+       "      <td>70 center street</td>\n",
+       "      <td>p o box 405</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.844089</td>\n",
+       "      <td>01013</td>\n",
+       "      <td>01021</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000036</td>\n",
+       "      <td>0.000061</td>\n",
+       "      <td>0.402918</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>ma</td>\n",
+       "      <td>ma</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.042950</td>\n",
+       "      <td>0.042950</td>\n",
+       "      <td>22.598054</td>\n",
+       "      <td>0.987961</td>\n",
+       "      <td>chicopee</td>\n",
+       "      <td>chicopee</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000117</td>\n",
+       "      <td>0.000117</td>\n",
+       "      <td>215.559681</td>\n",
+       "      <td>35.431042</td>\n",
+       "      <td>XKP BNKRP INKRPRTT</td>\n",
+       "      <td>XKP ST OF</td>\n",
+       "      <td>2010</td>\n",
+       "      <td>2010</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10043</th>\n",
+       "      <td>45.026591</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>177698</td>\n",
+       "      <td>67483</td>\n",
+       "      <td>green mountain power corporation</td>\n",
+       "      <td>green mountain power corporation</td>\n",
+       "      <td>3</td>\n",
+       "      <td>9751.372250</td>\n",
+       "      <td>163 acorn lane</td>\n",
+       "      <td>163 acorn lane</td>\n",
+       "      <td>3</td>\n",
+       "      <td>24859.333063</td>\n",
+       "      <td>05446</td>\n",
+       "      <td>05446</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.000143</td>\n",
+       "      <td>0.000143</td>\n",
+       "      <td>1447.988342</td>\n",
+       "      <td>2.894003</td>\n",
+       "      <td>vt</td>\n",
+       "      <td>vt</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.002680</td>\n",
+       "      <td>0.002680</td>\n",
+       "      <td>22.598054</td>\n",
+       "      <td>15.835981</td>\n",
+       "      <td>colchester</td>\n",
+       "      <td>colchester</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000198</td>\n",
+       "      <td>0.000198</td>\n",
+       "      <td>215.559681</td>\n",
+       "      <td>20.959208</td>\n",
+       "      <td>KRN MNTN PWR KRPRXN</td>\n",
+       "      <td>KRN MNTN PWR KRPRXN</td>\n",
+       "      <td>2001</td>\n",
+       "      <td>2001</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10051</th>\n",
+       "      <td>45.026591</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>177702</td>\n",
+       "      <td>67479</td>\n",
+       "      <td>green mountain power corporation</td>\n",
+       "      <td>green mountain power corporation</td>\n",
+       "      <td>3</td>\n",
+       "      <td>9751.372250</td>\n",
+       "      <td>163 acorn lane</td>\n",
+       "      <td>163 acorn lane</td>\n",
+       "      <td>3</td>\n",
+       "      <td>24859.333063</td>\n",
+       "      <td>05446</td>\n",
+       "      <td>05446</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.000143</td>\n",
+       "      <td>0.000143</td>\n",
+       "      <td>1447.988342</td>\n",
+       "      <td>2.894003</td>\n",
+       "      <td>vt</td>\n",
+       "      <td>vt</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.002680</td>\n",
+       "      <td>0.002680</td>\n",
+       "      <td>22.598054</td>\n",
+       "      <td>15.835981</td>\n",
+       "      <td>colchester</td>\n",
+       "      <td>colchester</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000198</td>\n",
+       "      <td>0.000198</td>\n",
+       "      <td>215.559681</td>\n",
+       "      <td>20.959208</td>\n",
+       "      <td>KRN MNTN PWR KRPRXN</td>\n",
+       "      <td>KRN MNTN PWR KRPRXN</td>\n",
+       "      <td>2005</td>\n",
+       "      <td>2005</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10050</th>\n",
+       "      <td>45.026591</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>177701</td>\n",
+       "      <td>67480</td>\n",
+       "      <td>green mountain power corporation</td>\n",
+       "      <td>green mountain power corporation</td>\n",
+       "      <td>3</td>\n",
+       "      <td>9751.372250</td>\n",
+       "      <td>163 acorn lane</td>\n",
+       "      <td>163 acorn lane</td>\n",
+       "      <td>3</td>\n",
+       "      <td>24859.333063</td>\n",
+       "      <td>05446</td>\n",
+       "      <td>05446</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.000143</td>\n",
+       "      <td>0.000143</td>\n",
+       "      <td>1447.988342</td>\n",
+       "      <td>2.894003</td>\n",
+       "      <td>vt</td>\n",
+       "      <td>vt</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.002680</td>\n",
+       "      <td>0.002680</td>\n",
+       "      <td>22.598054</td>\n",
+       "      <td>15.835981</td>\n",
+       "      <td>colchester</td>\n",
+       "      <td>colchester</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000198</td>\n",
+       "      <td>0.000198</td>\n",
+       "      <td>215.559681</td>\n",
+       "      <td>20.959208</td>\n",
+       "      <td>KRN MNTN PWR KRPRXN</td>\n",
+       "      <td>KRN MNTN PWR KRPRXN</td>\n",
+       "      <td>2004</td>\n",
+       "      <td>2004</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10049</th>\n",
+       "      <td>45.026591</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>177699</td>\n",
+       "      <td>67482</td>\n",
+       "      <td>green mountain power corporation</td>\n",
+       "      <td>green mountain power corporation</td>\n",
+       "      <td>3</td>\n",
+       "      <td>9751.372250</td>\n",
+       "      <td>163 acorn lane</td>\n",
+       "      <td>163 acorn lane</td>\n",
+       "      <td>3</td>\n",
+       "      <td>24859.333063</td>\n",
+       "      <td>05446</td>\n",
+       "      <td>05446</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.000143</td>\n",
+       "      <td>0.000143</td>\n",
+       "      <td>1447.988342</td>\n",
+       "      <td>2.894003</td>\n",
+       "      <td>vt</td>\n",
+       "      <td>vt</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.002680</td>\n",
+       "      <td>0.002680</td>\n",
+       "      <td>22.598054</td>\n",
+       "      <td>15.835981</td>\n",
+       "      <td>colchester</td>\n",
+       "      <td>colchester</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000198</td>\n",
+       "      <td>0.000198</td>\n",
+       "      <td>215.559681</td>\n",
+       "      <td>20.959208</td>\n",
+       "      <td>KRN MNTN PWR KRPRXN</td>\n",
+       "      <td>KRN MNTN PWR KRPRXN</td>\n",
+       "      <td>2002</td>\n",
+       "      <td>2002</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10035</th>\n",
+       "      <td>45.026591</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>177700</td>\n",
+       "      <td>67481</td>\n",
+       "      <td>green mountain power corporation</td>\n",
+       "      <td>green mountain power corporation</td>\n",
+       "      <td>3</td>\n",
+       "      <td>9751.372250</td>\n",
+       "      <td>163 acorn lane</td>\n",
+       "      <td>163 acorn lane</td>\n",
+       "      <td>3</td>\n",
+       "      <td>24859.333063</td>\n",
+       "      <td>05446</td>\n",
+       "      <td>05446</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.000143</td>\n",
+       "      <td>0.000143</td>\n",
+       "      <td>1447.988342</td>\n",
+       "      <td>2.894003</td>\n",
+       "      <td>vt</td>\n",
+       "      <td>vt</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.002680</td>\n",
+       "      <td>0.002680</td>\n",
+       "      <td>22.598054</td>\n",
+       "      <td>15.835981</td>\n",
+       "      <td>colchester</td>\n",
+       "      <td>colchester</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000198</td>\n",
+       "      <td>0.000198</td>\n",
+       "      <td>215.559681</td>\n",
+       "      <td>20.959208</td>\n",
+       "      <td>KRN MNTN PWR KRPRXN</td>\n",
+       "      <td>KRN MNTN PWR KRPRXN</td>\n",
+       "      <td>2003</td>\n",
+       "      <td>2003</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>12713 rows × 40 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       match_weight  match_probability         source_dataset_l         source_dataset_r  record_id_l  record_id_r                    company_name_l                    company_name_r  gamma_company_name  bf_company_name       street_address_l  street_address_r  gamma_street_address  bf_street_address zip_code_l zip_code_r  gamma_zip_code  tf_zip_code_l  tf_zip_code_r  bf_zip_code  bf_tf_adj_zip_code state_l state_r  gamma_state  tf_state_l  tf_state_r   bf_state  bf_tf_adj_state      city_l      city_r  gamma_city  tf_city_l  tf_city_r     bf_city  bf_tf_adj_city company_name_mphone_l company_name_mphone_r  report_year_l  report_year_r match_key\n",
+       "11211      0.054332           0.509414  __splink__input_table_0  __splink__input_table_1        85762        68295          citi trends incorporated       georgia pacific corporation                   1         1.462842  104 coleman boulevard              None                    -1           1.000000      31408      31326               0       0.000045       0.000103     0.402918            1.000000      ga      ga            1    0.023374    0.023374  22.598054         1.815434    savannah    savannah           2   0.000454   0.000454  215.559681        9.129471     ST TRNTS INKRPRTT       JRJ PSFK KRPRXN           2021           2008         0\n",
+       "11666      0.098035           0.516982  __splink__input_table_0  __splink__input_table_1        94615        75114    chicopee bancorp, incorporated                  chicopee city of                   0         0.845800       70 center street  725 front street                     0           0.844089      01013      01021               0       0.000036       0.000061     0.402918            1.000000      ma      ma            1    0.042950    0.042950  22.598054         0.987961    chicopee    chicopee           2   0.000117   0.000117  215.559681       35.431042    XKP BNKRP INKRPRTT             XKP ST OF           2012           2012         0\n",
+       "11665      0.098035           0.516982  __splink__input_table_0  __splink__input_table_1        94614        75115    chicopee bancorp, incorporated                  chicopee city of                   0         0.845800       70 center street  725 front street                     0           0.844089      01013      01021               0       0.000036       0.000061     0.402918            1.000000      ma      ma            1    0.042950    0.042950  22.598054         0.987961    chicopee    chicopee           2   0.000117   0.000117  215.559681       35.431042    XKP BNKRP INKRPRTT             XKP ST OF           2011           2011         0\n",
+       "11668      0.098035           0.516982  __splink__input_table_0  __splink__input_table_1        94618        75118    chicopee bancorp, incorporated                  chicopee city of                   0         0.845800       70 center street  725 front street                     0           0.844089      01013      01021               0       0.000036       0.000061     0.402918            1.000000      ma      ma            1    0.042950    0.042950  22.598054         0.987961    chicopee    chicopee           2   0.000117   0.000117  215.559681       35.431042    XKP BNKRP INKRPRTT             XKP ST OF           2008           2008         0\n",
+       "11669      0.098035           0.516982  __splink__input_table_0  __splink__input_table_1        94620        75116    chicopee bancorp, incorporated                  chicopee city of                   0         0.845800       70 center street       p o box 405                     0           0.844089      01013      01021               0       0.000036       0.000061     0.402918            1.000000      ma      ma            1    0.042950    0.042950  22.598054         0.987961    chicopee    chicopee           2   0.000117   0.000117  215.559681       35.431042    XKP BNKRP INKRPRTT             XKP ST OF           2010           2010         0\n",
+       "...             ...                ...                      ...                      ...          ...          ...                               ...                               ...                 ...              ...                    ...               ...                   ...                ...        ...        ...             ...            ...            ...          ...                 ...     ...     ...          ...         ...         ...        ...              ...         ...         ...         ...        ...        ...         ...             ...                   ...                   ...            ...            ...       ...\n",
+       "10043     45.026591           1.000000  __splink__input_table_0  __splink__input_table_1       177698        67483  green mountain power corporation  green mountain power corporation                   3      9751.372250         163 acorn lane    163 acorn lane                     3       24859.333063      05446      05446               1       0.000143       0.000143  1447.988342            2.894003      vt      vt            1    0.002680    0.002680  22.598054        15.835981  colchester  colchester           2   0.000198   0.000198  215.559681       20.959208   KRN MNTN PWR KRPRXN   KRN MNTN PWR KRPRXN           2001           2001         0\n",
+       "10051     45.026591           1.000000  __splink__input_table_0  __splink__input_table_1       177702        67479  green mountain power corporation  green mountain power corporation                   3      9751.372250         163 acorn lane    163 acorn lane                     3       24859.333063      05446      05446               1       0.000143       0.000143  1447.988342            2.894003      vt      vt            1    0.002680    0.002680  22.598054        15.835981  colchester  colchester           2   0.000198   0.000198  215.559681       20.959208   KRN MNTN PWR KRPRXN   KRN MNTN PWR KRPRXN           2005           2005         0\n",
+       "10050     45.026591           1.000000  __splink__input_table_0  __splink__input_table_1       177701        67480  green mountain power corporation  green mountain power corporation                   3      9751.372250         163 acorn lane    163 acorn lane                     3       24859.333063      05446      05446               1       0.000143       0.000143  1447.988342            2.894003      vt      vt            1    0.002680    0.002680  22.598054        15.835981  colchester  colchester           2   0.000198   0.000198  215.559681       20.959208   KRN MNTN PWR KRPRXN   KRN MNTN PWR KRPRXN           2004           2004         0\n",
+       "10049     45.026591           1.000000  __splink__input_table_0  __splink__input_table_1       177699        67482  green mountain power corporation  green mountain power corporation                   3      9751.372250         163 acorn lane    163 acorn lane                     3       24859.333063      05446      05446               1       0.000143       0.000143  1447.988342            2.894003      vt      vt            1    0.002680    0.002680  22.598054        15.835981  colchester  colchester           2   0.000198   0.000198  215.559681       20.959208   KRN MNTN PWR KRPRXN   KRN MNTN PWR KRPRXN           2002           2002         0\n",
+       "10035     45.026591           1.000000  __splink__input_table_0  __splink__input_table_1       177700        67481  green mountain power corporation  green mountain power corporation                   3      9751.372250         163 acorn lane    163 acorn lane                     3       24859.333063      05446      05446               1       0.000143       0.000143  1447.988342            2.894003      vt      vt            1    0.002680    0.002680  22.598054        15.835981  colchester  colchester           2   0.000198   0.000198  215.559681       20.959208   KRN MNTN PWR KRPRXN   KRN MNTN PWR KRPRXN           2003           2003         0\n",
+       "\n",
+       "[12713 rows x 40 columns]"
+      ]
+     },
+     "execution_count": 392,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "preds_df.sort_values(by=\"match_probability\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4f63fb3d-5fac-476d-9271-347412121902",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "mozilla_sec_eia",
+   "language": "python",
+   "name": "mozilla_sec_eia"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/pyproject.toml b/pyproject.toml
index c38ae7e..e0129cb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,7 +14,7 @@ readme = {file = "README.rst", content-type = "text/x-rst"}
 authors = [
     {name = "Catalyst Cooperative", email = "pudl@catalyst.coop"}
 ]
-requires-python = ">=3.10,<3.12"
+requires-python = ">=3.10,<=3.12"
 dynamic = ["version"]
 license = {file = "LICENSE.txt"}
 dependencies = [
@@ -30,6 +30,7 @@ dependencies = [
     "google-cloud-secret-manager>=2,<3",
     "google-cloud-storage>=2,<3",
     "hypothesis",
+    "jellyfish>=1.1",
     "matplotlib>=3.8,<4",
     "mlflow>=2.12",
     "opencv-python",
@@ -62,6 +63,7 @@ classifiers = [
     "Programming Language :: Python :: 3 :: Only",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
 ]
 keywords = [
     "template",
@@ -95,7 +97,7 @@ docs = [
     "furo>=2022.4.7",
     "sphinx>=6,<8.1",  # The default Python documentation engine
     "sphinx-autoapi>=2,<4",  # Generates documentation from docstrings
-    "sphinx-issues>=1.2,<5",  # Allows references to GitHub issues
+    "sphinx-issues>=5",  # Allows references to GitHub issues
 
 ]
 tests = [
@@ -201,8 +203,8 @@ lint.ignore = [
     "EXE002",
 ]
 
-# Assume Python 3.11
-target-version = "py311"
+# Assume Python 3.12
+target-version = "py312"
 line-length = 88
 
 # Don't automatically concatenate strings -- sometimes we forget a comma!
@@ -231,6 +233,6 @@ inline-quotes = "double"
 multiline-quotes = "double"
 
 [tool.mypy]
-python_version = "3.10"
+python_version = "3.12"
 warn_return_any = true
 warn_unused_configs = true
diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py
new file mode 100644
index 0000000..0fec63c
--- /dev/null
+++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py
@@ -0,0 +1 @@
+from . import preprocessing
diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py
new file mode 100644
index 0000000..9080cd7
--- /dev/null
+++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py
@@ -0,0 +1,288 @@
+"""Preprocessing for EIA and SEC input data before record linkage."""
+
+import jellyfish
+import numpy as np
+import pandas as pd
+
+from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive
+from pudl.analysis.record_linkage import name_cleaner
+
+EIA_COL_MAP = {
+    "utility_name_eia": "company_name",  # TODO: should be linking to owner or operator name?
+    "address_2": "street_address_2",
+}
+
+EX21_COL_MAP = {"subsidiary": "company_name", "loc": "loc_of_incorporation"}
+
+SEC_COL_MAP = {
+    "company_conformed_name": "company_name",
+    "street_1": "street_address",
+    "street_2": "street_address_2",
+    "zip": "zip_code",
+    "business_phone": "phone_number",
+    "date_filed": "report_date",
+}
+
+SHARED_COLS = [
+    "report_date",
+    "report_year",
+    "company_name",
+    "street_address",
+    "street_address_2",
+    "city",
+    "state",  # could use state of incorporation from SEC
+    "zip_code",
+    "phone_number",
+]
+
+STR_COLS = [
+    "company_name",
+    "street_address",
+    "street_address_2",
+    "city",
+    "state",
+    "zip_code",
+]
+
+INVALID_NAMES = [
+    "llc",
+    "limited liability company",
+    "limited",
+    "ltd",
+    "iiii",
+    "inc",
+    "incorporated",
+    "partnership",
+    "i",
+    "name",
+    "company",
+    "&",
+    "",
+]
+
+state_code_dict = {
+    # https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States#States.
+    "AK": "Alaska",
+    "AL": "Alabama",
+    "AR": "Arkansas",
+    "AZ": "Arizona",
+    "CA": "California",
+    "CO": "Colorado",
+    "CT": "Connecticut",
+    "DE": "Delaware",
+    "FL": "Florida",
+    "GA": "Georgia",
+    "HI": "Hawaii",
+    "IA": "Iowa",
+    "ID": "Idaho",
+    "IL": "Illinois",
+    "IN": "Indiana",
+    "KS": "Kansas",
+    "KY": "Kentucky",
+    "LA": "Louisiana",
+    "MA": "Massachusetts",
+    "MD": "Maryland",
+    "ME": "Maine",
+    "MI": "Michigan",
+    "MN": "Minnesota",
+    "MO": "Missouri",
+    "MS": "Mississippi",
+    "MT": "Montana",
+    "NC": "North Carolina",
+    "ND": "North Dakota",
+    "NE": "Nebraska",
+    "NH": "New Hampshire",
+    "NJ": "New Jersey",
+    "NM": "New Mexico",
+    "NV": "Nevada",
+    "NY": "New York",
+    "OH": "Ohio",
+    "OK": "Oklahoma",
+    "OR": "Oregon",
+    "PA": "Pennsylvania",
+    "RI": "Rhode Island",
+    "SC": "South Carolina",
+    "SD": "South Dakota",
+    "TN": "Tennessee",
+    "TX": "Texas",
+    "UT": "Utah",
+    "VA": "Virginia",
+    "VT": "Vermont",
+    "WA": "Washington",
+    "WI": "Wisconsin",
+    "WV": "West Virginia",
+    "WY": "Wyoming",
+    # https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States#Federal_district.
+    "DC": "District of Columbia",
+    # https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States#Inhabited_territories.
+    "AS": "American Samoa",
+    "GU": "Guam GU",
+    "MP": "Northern Mariana Islands",
+    "PR": "Puerto Rico PR",
+    "VI": "U.S. Virgin Islands",
+}
+state_code_to_name = {k.lower(): v.lower() for k, v in state_code_dict.items()}
+
+company_name_cleaner = name_cleaner.CompanyNameCleaner(
+    cleaning_rules_list=[
+        "remove_word_the_from_the_end",
+        "remove_word_the_from_the_beginning",
+        "replace_amperstand_between_space_by_AND",
+        "replace_hyphen_by_space",
+        "replace_hyphen_between_spaces_by_single_space",
+        "replace_underscore_by_space",
+        "replace_underscore_between_spaces_by_single_space",
+        # "remove_all_punctuation",
+        # "remove_numbers",
+        # "remove_math_symbols",
+        "remove_words_in_parentheses",
+        "remove_parentheses",
+        "remove_brackets",
+        "remove_curly_brackets",
+        "enforce_single_space_between_words",
+    ]
+)
+
+
+def _add_report_year_to_sec(sec_df):
+    """Merge metadata on to get a report year for extracted SEC data.
+
+    Expects filename to be the index of the SEC dataframe.
+    """
+    archive = GCSArchive()
+    md = archive.get_metadata()
+    return sec_df.merge(
+        md[["date_filed"]], how="left", left_index=True, right_index=True
+    )
+
+
+# TODO: this is in PUDL, pull out into helper function
+def _get_metaphone(row, col_name):
+    if pd.isnull(row[col_name]):
+        return None
+    return jellyfish.metaphone(row[col_name])
+
+
+def _clean_company_name(df):
+    df.loc[:, "company_name_clean"] = company_name_cleaner.apply_name_cleaning(
+        df[["company_name"]]
+    )
+    df = df[df["company_name_clean"] != ""]
+    df = df.rename(columns={"company_name": "company_name_raw"}).rename(
+        columns={"company_name_clean": "company_name"}
+    )
+    return df
+
+
+def clean_sec_df(df):
+    """Shared cleaning for SEC 10K and Ex. 21 dataframes.
+
+    Arguments:
+        df: Ex. 21 or SEC 10K basic info dataframe with columns
+        company_name, loc_of_incorporation, and report_year.
+    """
+    df[["company_name", "loc_of_incorporation"]] = (
+        df[["company_name", "loc_of_incorporation"]]
+        .fillna(pd.NA)
+        .apply(lambda x: x.str.strip().str.lower())
+    )
+    df.loc[:, "company_name"] = df["company_name"].replace("", pd.NA)
+    df.loc[:, "loc_of_incorporation"] = df["loc_of_incorporation"].replace("", pd.NA)
+    df = _clean_company_name(df)
+    df = df[
+        (~df["company_name"].isin(INVALID_NAMES))
+        & ~(df["company_name_raw"].isin(INVALID_NAMES))
+    ]
+    df = df.fillna(np.nan)
+    df = df.drop_duplicates(
+        subset=["company_name", "loc_of_incorporation", "report_year"]
+    )
+    return df
+
+
+def _remove_weird_sec_cols(sec_df):
+    for weird_col in ["]fiscal_year_end", "]irs_number", "]state_of_incorporation"]:
+        if weird_col not in sec_df:
+            continue
+        normal_col = weird_col[1:]
+        sec_df.loc[:, normal_col] = sec_df[normal_col].where(
+            sec_df[weird_col].isnull(), sec_df[weird_col]
+        )
+    return sec_df
+
+
+# TODO: for now split these into separate cleaning functions
+# later unite them into one cleaning function
+def prepare_sec10k_basic_info_df(sec_df):
+    """Preprocess SEC 10k basic information dataframe for record linkage."""
+    sec_df = _add_report_year_to_sec(sec_df)
+    sec_df = sec_df.rename(columns=SEC_COL_MAP).reset_index()
+    sec_df.loc[:, "report_year"] = (
+        sec_df["report_date"].astype("datetime64[ns]").dt.year
+    )
+    sec_df.loc[:, "loc_of_incorporation"] = sec_df["state_of_incorporation"].replace(
+        state_code_to_name
+    )
+    # TODO: maybe shouldn't expand the state names and comparison should
+    # just be an exact match or nothing?
+    # sec_df.loc[:, "state"] = sec_df["state"].replace(state_code_to_name)
+    # TODO: needs a record_id_sec column?
+    # sec_df = sec_df.rename(columns={"record_id_sec": "record_id"})
+    sec_df = _remove_weird_sec_cols(sec_df)
+    sec_df = clean_sec_df(sec_df)
+    sec_df[STR_COLS] = sec_df[STR_COLS].apply(lambda x: x.str.strip().str.lower())
+    sec_df.loc[:, "company_name_mphone"] = sec_df.apply(
+        _get_metaphone, axis=1, args=("company_name",)
+    )
+    sec_df = sec_df.reset_index(names="record_id")
+    return sec_df
+
+
+def prepare_ex21_df(ex21_df):
+    """Preprocess Ex. 21 extracted dataframe for record linkage."""
+    ex21_df = ex21_df.rename(columns=EX21_COL_MAP)
+    # TODO: move this to general preprocessing function?
+    ex21_df.loc[:, "loc_of_incorporation"] = ex21_df["loc_of_incorporation"].replace(
+        state_code_to_name
+    )
+    ex21_df = clean_sec_df(ex21_df)
+    ex21_df.loc[:, "company_name_mphone"] = ex21_df.apply(
+        _get_metaphone, axis=1, args=("company_name",)
+    )
+    ex21_df = ex21_df.reset_index(names="record_id")
+    return ex21_df
+
+
+def prepare_eia_df(eia_df):
+    """Preprocess EIA utility dataframe for record linkage."""
+    eia_df = eia_df.rename(columns=EIA_COL_MAP)
+    eia_df.loc[:, "report_year"] = (
+        eia_df["report_date"].astype("datetime64[ns]").dt.year
+    )
+    eia_df = eia_df.fillna(np.nan)
+    eia_df[STR_COLS] = eia_df[STR_COLS].apply(lambda x: x.str.strip().str.lower())
+    eia_df = _clean_company_name(eia_df)
+    eia_df.loc[:, "company_name_mphone"] = eia_df.apply(
+        _get_metaphone, axis=1, args=("company_name",)
+    )
+    eia_df = eia_df.reset_index(names="record_id")
+    return eia_df
+
+
+"""
+def preprocessing(eia_df, sec_df):
+    # TODO: reorganize to be more similar to ferc to eia match structure
+    eia_df = eia_df.rename(columns=EIA_COL_MAP)
+
+    # TODO: fill out this prepare for matching function
+    # eia_df = prepare_for_matching(eia_df)
+    # sec_df = prepare_for_matching(sec_df)
+    sec_df.loc[:, "loc_of_incorporation"] = sec_df["state_of_incorporation"].replace(
+        state_code_to_name
+    )
+    sec_df.loc[:, "loc_of_incorporation"] = sec_df["loc_of_incorporation"].where(
+        ~sec_df["loc_of_incorporation"].isnull(), sec_df["city"]
+    )
+    sec_df = sec_df.rename(columns={"record_id_sec": "record_id"})
+    eia_df = eia_df.rename(columns={"record_id_eia": "record_id"})
+"""

From 2dbdcaad4db68711985635f4cbc20f05b678bac8 Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Tue, 22 Oct 2024 14:16:32 -0700
Subject: [PATCH 125/161] clean up feature creation in paragraph classifier

---
 .../exhibit21_layout_classifier.ipynb         | 22 +++++++------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb
index 8315fc1..a45c2e3 100644
--- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb
+++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb
@@ -55,7 +55,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "id": "ee4ed368-7d01-4cb8-952f-f7941900d669",
    "metadata": {
     "tags": []
@@ -71,14 +71,9 @@
     "    \"\"\"Compute features from bounding boxes in inference dataset.\"\"\"\n",
     "    df = pd.DataFrame(record[\"bboxes\"], columns=BBOX_COLS_PDF)\n",
     "    features = {}\n",
-    "    features[\"n_bboxes\"] = len(df)\n",
-    "\n",
-    "    # block density wasn't a very useful feature, maybe rework?\n",
-    "    # Calculate the bounding box density of the area of the page with text\n",
-    "    # x_width = df[\"bottom_right_x_pdf\"].max() - df[\"top_left_x_pdf\"].min()\n",
-    "    # y_height = df[\"bottom_right_y_pdf\"].max() - df[\"top_left_y_pdf\"].min()\n",
-    "    # text_area = x_width * y_height\n",
-    "    # features[\"block_density\"] = features[\"n_bboxes\"] / text_area\n",
+    "    \n",
+    "    y_height = df[\"bottom_right_y_pdf\"].max() - df[\"top_left_y_pdf\"].min()\n",
+    "    features[\"block_y_density\"] = len(df) / y_height\n",
     "\n",
     "    # Calculate average y-distance between bounding boxes for a given document\n",
     "    df = df.sort_values(by=[\"top_left_y_pdf\", \"top_left_x_pdf\"])\n",
@@ -87,12 +82,11 @@
     "    features[\"std_y_distance\"] = y_diffs.std()\n",
     "\n",
     "    # Calculate x-distance to assess horizontal alignment\n",
-    "    x_diffs = df.groupby(\"top_left_y_pdf\")[\"top_left_x_pdf\"].apply(lambda x: x.diff().dropna())\n",
-    "    features[\"avg_x_distance\"] = x_diffs.mean()\n",
-    "    features[\"std_x_distance\"] = x_diffs.std()\n",
+    "    x_diffs = df.groupby('line_group')['top_left_x_pdf'].apply(lambda x: x.diff().dropna())\n",
+    "    features['avg_x_distance'] = x_diffs.mean()\n",
     "\n",
     "    # Define a small threshold to group bounding boxes that are on the same line\n",
-    "    y_threshold = 0.1\n",
+    "    y_threshold = 0.5\n",
     "    df.loc[:, \"line_group\"] = (df[\"top_left_y_pdf\"].diff().fillna(0).abs() > y_threshold).cumsum()\n",
     "    boxes_per_line = df.groupby(\"line_group\").size()\n",
     "    features[\"median_boxes_per_line\"] = boxes_per_line.median()\n",
@@ -329,7 +323,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.10"
+   "version": "3.12.0"
   }
  },
  "nbformat": 4,

From cda3225b885f6b705dea7d5e4fa66b20f89b1347 Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Tue, 22 Oct 2024 14:49:56 -0700
Subject: [PATCH 126/161] fix feature creation function

---
 .../sec10k/notebooks/exhibit21_layout_classifier.ipynb     | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb
index a45c2e3..455910f 100644
--- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb
+++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb
@@ -80,14 +80,15 @@
     "    y_diffs = df[\"top_left_y_pdf\"].diff().dropna()\n",
     "    features[\"avg_y_distance\"] = y_diffs.mean()\n",
     "    features[\"std_y_distance\"] = y_diffs.std()\n",
+    "    \n",
+    "    # Define a small threshold to group bounding boxes that are on the same line\n",
+    "    y_threshold = 0.5\n",
+    "    df.loc[:, 'line_group'] = (df['top_left_y_pdf'].diff().fillna(0).abs() > y_threshold).cumsum()\n",
     "\n",
     "    # Calculate x-distance to assess horizontal alignment\n",
     "    x_diffs = df.groupby('line_group')['top_left_x_pdf'].apply(lambda x: x.diff().dropna())\n",
     "    features['avg_x_distance'] = x_diffs.mean()\n",
     "\n",
-    "    # Define a small threshold to group bounding boxes that are on the same line\n",
-    "    y_threshold = 0.5\n",
-    "    df.loc[:, \"line_group\"] = (df[\"top_left_y_pdf\"].diff().fillna(0).abs() > y_threshold).cumsum()\n",
     "    boxes_per_line = df.groupby(\"line_group\").size()\n",
     "    features[\"median_boxes_per_line\"] = boxes_per_line.median()\n",
     "    return pd.Series(features)"

From 509b7a0c23cc72460e447ff3d879a541d78382fb Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 22 Oct 2024 22:28:18 +0000
Subject: [PATCH 127/161] [pre-commit.ci] auto fixes from pre-commit.com hooks

For more information, see https://pre-commit.ci
---
 .../sec10k/notebooks/exhibit21_layout_classifier.ipynb | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb
index 455910f..a41f6d3 100644
--- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb
+++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb
@@ -71,7 +71,7 @@
     "    \"\"\"Compute features from bounding boxes in inference dataset.\"\"\"\n",
     "    df = pd.DataFrame(record[\"bboxes\"], columns=BBOX_COLS_PDF)\n",
     "    features = {}\n",
-    "    \n",
+    "\n",
     "    y_height = df[\"bottom_right_y_pdf\"].max() - df[\"top_left_y_pdf\"].min()\n",
     "    features[\"block_y_density\"] = len(df) / y_height\n",
     "\n",
@@ -80,14 +80,14 @@
     "    y_diffs = df[\"top_left_y_pdf\"].diff().dropna()\n",
     "    features[\"avg_y_distance\"] = y_diffs.mean()\n",
     "    features[\"std_y_distance\"] = y_diffs.std()\n",
-    "    \n",
+    "\n",
     "    # Define a small threshold to group bounding boxes that are on the same line\n",
     "    y_threshold = 0.5\n",
-    "    df.loc[:, 'line_group'] = (df['top_left_y_pdf'].diff().fillna(0).abs() > y_threshold).cumsum()\n",
+    "    df.loc[:, \"line_group\"] = (df[\"top_left_y_pdf\"].diff().fillna(0).abs() > y_threshold).cumsum()\n",
     "\n",
     "    # Calculate x-distance to assess horizontal alignment\n",
-    "    x_diffs = df.groupby('line_group')['top_left_x_pdf'].apply(lambda x: x.diff().dropna())\n",
-    "    features['avg_x_distance'] = x_diffs.mean()\n",
+    "    x_diffs = df.groupby(\"line_group\")[\"top_left_x_pdf\"].apply(lambda x: x.diff().dropna())\n",
+    "    features[\"avg_x_distance\"] = x_diffs.mean()\n",
     "\n",
     "    boxes_per_line = df.groupby(\"line_group\").size()\n",
     "    features[\"median_boxes_per_line\"] = boxes_per_line.median()\n",

From 8855e5ed1451489aa7225cb1ffdcfd4607d9fc63 Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Tue, 22 Oct 2024 15:45:19 -0700
Subject: [PATCH 128/161] small fixes to read in comments in tracking dataframe

---
 src/mozilla_sec_eia/library/validation_helpers.py      | 3 ++-
 src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py | 3 +--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/mozilla_sec_eia/library/validation_helpers.py b/src/mozilla_sec_eia/library/validation_helpers.py
index a05d4d0..adb278f 100644
--- a/src/mozilla_sec_eia/library/validation_helpers.py
+++ b/src/mozilla_sec_eia/library/validation_helpers.py
@@ -24,7 +24,8 @@ def load_validation_data(
 ) -> pd.DataFrame:
     """Load csv with validation data from `package_data` directory."""
     df = pd.read_csv(
-        resources.files("mozilla_sec_eia.package_data.validation_data") / filename
+        resources.files("mozilla_sec_eia.package_data.validation_data") / filename,
+        comment="#",
     )
     if index_cols is not None:
         df = df.set_index(index_cols)
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py
index 5f79109..91617dd 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py
@@ -86,8 +86,7 @@ def iob_to_label(label):
 
 
 def _is_cik_in_training_data(labeled_json_filename, tracking_df):
-    # TODO: for now CIK is stored as an int, update when fixed
-    cik = int(labeled_json_filename.split("/")[-1].split("-")[0])
+    cik = labeled_json_filename.split("/")[-1].split("-")[0]
     return cik in tracking_df.CIK.unique()
 
 

From 590ba60766b7b31501bb20c30e18329c6a499ef0 Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Tue, 22 Oct 2024 22:18:33 -0700
Subject: [PATCH 129/161] updates to model pipeline

---
 .../models/sec10k/ex_21/data/common.py        |   8 +-
 .../notebooks/exhibit21_extractor.ipynb       | 129 ++++++------------
 2 files changed, 48 insertions(+), 89 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py
index 91617dd..157b538 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py
@@ -1,6 +1,7 @@
 """Implement methods used to construct both inference and training sets."""
 
 import json
+import logging
 import os
 from pathlib import Path
 
@@ -11,6 +12,8 @@
 
 from ...utils.pdf import get_pdf_data_from_path
 
+logger = logging.getLogger(f"catalystcoop.{__name__}")
+
 LABEL_PRIORITY = [
     "I-Subsidiary",
     "I-Loc",
@@ -86,7 +89,8 @@ def iob_to_label(label):
 
 
 def _is_cik_in_training_data(labeled_json_filename, tracking_df):
-    cik = labeled_json_filename.split("/")[-1].split("-")[0]
+    cik = int(labeled_json_filename.split("/")[-1].split("-")[0])
+    logger.warning(f"CIK: {cik}")
     return cik in tracking_df.CIK.unique()
 
 
@@ -97,6 +101,7 @@ def format_label_studio_output(
     """Format Label Studio output JSONs into dataframe."""
     labeled_df = pd.DataFrame()
     tracking_df = validation_helpers.load_training_data("ex21_labels.csv")
+    logger.warning(f"tracking_df: {tracking_df.CIK.unique()}")
 
     for json_filename in os.listdir(labeled_json_dir):
         if not json_filename[0].isdigit() or json_filename.endswith(".json"):
@@ -105,6 +110,7 @@ def format_label_studio_output(
         with Path.open(json_file_path) as j:
             doc_dict = json.loads(j.read())
             filename = doc_dict["task"]["data"]["ocr"].split("/")[-1].split(".")[0]
+            logger.warning(f"FILENAME: {filename}")
             # check if old local naming schema is being used
             if len(filename.split("-")) == 6:
                 filename = "-".join(filename.split("-")[2:])
diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
index 53e16c8..7e2852f 100644
--- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
+++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb
@@ -38,29 +38,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "id": "48f185de-95ef-4194-9245-93f8d603d2e6",
    "metadata": {
     "tags": [
      "parameters"
     ]
    },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n",
-      "2024-10-16 17:11:06 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_training_data using PickledObjectFilesystemIOManager...\n",
-      "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n",
-      "2024-10-16 17:11:12 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_failed_parsing_metadata using PickledObjectFilesystemIOManager...\n",
-      "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n",
-      "2024-10-16 17:11:12 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_inference_dataset using PickledObjectFilesystemIOManager...\n",
-      "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n",
-      "2024-10-16 17:11:15 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_validation_set using PickledObjectFilesystemIOManager...\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import dagstermill\n",
     "\n",
@@ -443,14 +428,13 @@
     "                \"doc_dict\": model_inputs[\"doc_dict\"],\n",
     "            }\n",
     "\n",
-    "    def postprocess(self, all_outputs):\n",
+    "    def postprocess(self, output_dict):\n",
     "        \"\"\"Return logits, model predictions, and the extracted dataframe.\"\"\"\n",
-    "        logits = all_outputs[\"logits\"]\n",
-    "        predictions = all_outputs[\"logits\"].argmax(-1).squeeze().tolist()\n",
-    "        output_df = self.extract_table(all_outputs)\n",
-    "        return logits, predictions, output_df\n",
+    "        output_df = self.extract_table(output_dict)\n",
+    "        output_dict[\"output_df\"] = output_df\n",
+    "        return output_dict\n",
     "\n",
-    "    def extract_table(self, all_outputs):\n",
+    "    def extract_table(self, output_dict):\n",
     "        \"\"\"Extract a structured table from a set of inference predictions.\n",
     "\n",
     "        This function essentially works by stacking bounding boxes and predictions\n",
@@ -463,9 +447,9 @@
     "        \"\"\"\n",
     "        # TODO: when model more mature, break this into sub functions to make it\n",
     "        # clearer what's going on\n",
-    "        predictions = all_outputs[\"predictions\"]\n",
-    "        encoding = all_outputs[\"raw_encoding\"]\n",
-    "        doc_dict = all_outputs[\"doc_dict\"]\n",
+    "        predictions = output_dict[\"predictions\"]\n",
+    "        encoding = output_dict[\"raw_encoding\"]\n",
+    "        doc_dict = output_dict[\"doc_dict\"]\n",
     "\n",
     "        token_boxes_tensor = encoding[\"bbox\"].flatten(start_dim=0, end_dim=1)\n",
     "        predictions_tensor = torch.tensor(predictions)\n",
@@ -496,6 +480,7 @@
     "        df = df.merge(words_df, how=\"left\", on=BBOX_COLS).drop_duplicates(\n",
     "            subset=BBOX_COLS + [\"pred\", \"word\"]\n",
     "        )\n",
+    "        df = df.sort_values(by=[\"top_left_y\", \"top_left_x\"])\n",
     "        # rows that are the first occurrence in a new group (subsidiary, loc, own_per)\n",
     "        # should always have a B entity label. Manually override labels so this is true.\n",
     "        first_in_group_df = df[\n",
@@ -541,69 +526,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 1,
    "id": "4d802e00-1ca4-40b3-b15b-561711a9db70",
    "metadata": {
     "tags": []
    },
    "outputs": [
     {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "d0779d02915a4503b0cd92d3df38cf88",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2024/10/16 17:11:20 INFO mlflow.types.utils: Unsupported type hint: <class 'pandas.core.frame.DataFrame'>, skipping schema inference\n"
+     "ename": "NameError",
+     "evalue": "name 'training_run_id' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[1], line 12\u001b[0m\n\u001b[1;32m      7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mmozilla_sec_eia\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01msec10k\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mex_21\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mex21_validation_helpers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m      8\u001b[0m     clean_extracted_df,\n\u001b[1;32m      9\u001b[0m )\n\u001b[1;32m     11\u001b[0m \u001b[38;5;66;03m# If a model was trained in this notebook, use it. Otherwise, use\u001b[39;00m\n\u001b[0;32m---> 12\u001b[0m model_uri \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mruns:/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[43mtraining_run_id\u001b[49m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/layoutlm_extractor\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m     13\u001b[0m model_info \u001b[38;5;241m=\u001b[39m mlflow\u001b[38;5;241m.\u001b[39mmodels\u001b[38;5;241m.\u001b[39mget_model_info(model_uri)\n\u001b[1;32m     15\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_get_data\u001b[39m(dataset):\n",
+      "\u001b[0;31mNameError\u001b[0m: name 'training_run_id' is not defined"
      ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "601bb4ae91dd4a218fe5be047f4829d0",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading artifacts:   0%|          | 0/17 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2024/10/16 17:11:51 WARNING mlflow.utils.requirements_utils: The following packages were not found in the public PyPI package index as of 2024-08-29; if these packages are not present in the public PyPI index, you must install them manually before loading your model: {'catalystcoop-mozilla-sec-eia'}\n",
-      "2024/10/16 17:11:51 WARNING mlflow.utils.requirements_utils: Found catalystcoop-mozilla-sec-eia version (0.1.dev353+gdf5fe0d.d20241011) contains a local version label (+gdf5fe0d.d20241011). MLflow logged a pip requirement for this package as 'catalystcoop-mozilla-sec-eia==0.1.dev353' without the local version label to make it installable from PyPI. To specify pip requirements containing local version labels, please use `conda_env` or `pip_requirements`.\n",
-      "2024/10/16 17:11:51 WARNING mlflow.transformers.model_io: Could not specify device parameter for this pipeline type.Falling back to loading the model with the default device.\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "68b8d5cef3a94294b243b6f0c3e8ee5f",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
     }
    ],
    "source": [
@@ -628,6 +566,14 @@
     "    \"\"\"Fill known nulls in location and own per column.\n",
     "\n",
     "    Fill with known values from rows with same subsidiary.\n",
+    "    If an extracted Ex. 21 table looks like the following:\n",
+    "\n",
+    "    subsidiary   loc       own_per\n",
+    "    Company A    NaN       NaN\n",
+    "    Company A    Delaware  50\n",
+    "\n",
+    "    Then fill in the first row with location and ownership\n",
+    "    percentage from the second row.\n",
     "    \"\"\"\n",
     "    if \"own_per\" in df:\n",
     "        df[\"own_per\"] = df.groupby([\"id\", \"subsidiary\"])[\"own_per\"].transform(\n",
@@ -672,9 +618,10 @@
     "        predictions = []\n",
     "        all_output_df = Ex21CompanyOwnership.example(size=0)\n",
     "        extraction_metadata = Sec10kExtractionMetadata.example(size=0)\n",
-    "        for logit, pred, output_df in pipe(_get_data(dataset)):\n",
-    "            logits.append(logit)\n",
-    "            predictions.append(pred)\n",
+    "        for output_dict in pipe(_get_data(dataset)):\n",
+    "            logits.append(output_dict[\"logits\"])\n",
+    "            predictions.append(output_dict[\"predictions\"])\n",
+    "            output_df = output_dict[\"output_df\"]\n",
     "            if not output_df.empty:\n",
     "                filename = get_metadata_filename(output_df[\"id\"].iloc[0])\n",
     "                extraction_metadata.loc[filename, [\"success\"]] = True\n",
@@ -684,7 +631,12 @@
     "        all_output_df = _fill_known_nulls(all_output_df)\n",
     "        all_output_df = all_output_df[[\"id\", \"subsidiary\", \"loc\", \"own_per\"]].drop_duplicates()\n",
     "        all_output_df = all_output_df.reset_index(drop=True)\n",
-    "        return extraction_metadata, all_output_df\n",
+    "        outputs_dict = {\n",
+    "            \"all_output_df\": all_output_df,\n",
+    "            \"logits\": logits,\n",
+    "            \"predictions\": predictions,\n",
+    "        }\n",
+    "        return extraction_metadata, outputs_dict\n",
     "\n",
     "# Save model to local temp dir with artifacts, then reload for evaluation\n",
     "with TemporaryDirectory() as tmp_dir:\n",
@@ -1096,7 +1048,8 @@
     ")\n",
     "\n",
     "with mlflow.start_run(parent_run_id=model_info.run_id, nested=True):\n",
-    "    metadata, extracted = ex21_extraction_model.predict(ex21_inference_dataset.copy())\n",
+    "    metadata, outputs_dict = ex21_extraction_model.predict(ex21_inference_dataset.copy())\n",
+    "    extracted = outputs_dict[\"all_output_df\"]\n",
     "    metadata = pd.concat([ex21_failed_parsing_metadata, metadata])\n",
     "\n",
     "    jaccard_df, prec_recall_df, incorrect_filenames, metrics = ex21_validation_metrics(extracted, ex21_validation_set)\n",
@@ -1136,7 +1089,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.10"
+   "version": "3.12.0"
   }
  },
  "nbformat": 4,

From 3db47d499e73eb73b5a0cf493f9bf2aaab21e5ed Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Wed, 23 Oct 2024 09:53:30 -0700
Subject: [PATCH 130/161] take out logging messages

---
 src/mozilla_sec_eia/library/validation_helpers.py      | 1 +
 src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py | 6 ------
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/src/mozilla_sec_eia/library/validation_helpers.py b/src/mozilla_sec_eia/library/validation_helpers.py
index adb278f..03044b2 100644
--- a/src/mozilla_sec_eia/library/validation_helpers.py
+++ b/src/mozilla_sec_eia/library/validation_helpers.py
@@ -92,6 +92,7 @@ def strip_down_company_names(ser: pd.Series) -> pd.Series:
 
     Used to compare subsidiary name columns during validation.
     """
+    # TODO: unify with PUDL
     # this JSON is taken from PUDL package data (used for CompanyNameCleaner)
     json_source = (
         resources.files("mozilla_sec_eia.package_data") / "us_legal_forms.json"
diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py
index 157b538..08b0440 100644
--- a/src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py
+++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py
@@ -1,7 +1,6 @@
 """Implement methods used to construct both inference and training sets."""
 
 import json
-import logging
 import os
 from pathlib import Path
 
@@ -12,8 +11,6 @@
 
 from ...utils.pdf import get_pdf_data_from_path
 
-logger = logging.getLogger(f"catalystcoop.{__name__}")
-
 LABEL_PRIORITY = [
     "I-Subsidiary",
     "I-Loc",
@@ -90,7 +87,6 @@ def iob_to_label(label):
 
 def _is_cik_in_training_data(labeled_json_filename, tracking_df):
     cik = int(labeled_json_filename.split("/")[-1].split("-")[0])
-    logger.warning(f"CIK: {cik}")
     return cik in tracking_df.CIK.unique()
 
 
@@ -101,7 +97,6 @@ def format_label_studio_output(
     """Format Label Studio output JSONs into dataframe."""
     labeled_df = pd.DataFrame()
     tracking_df = validation_helpers.load_training_data("ex21_labels.csv")
-    logger.warning(f"tracking_df: {tracking_df.CIK.unique()}")
 
     for json_filename in os.listdir(labeled_json_dir):
         if not json_filename[0].isdigit() or json_filename.endswith(".json"):
@@ -110,7 +105,6 @@ def format_label_studio_output(
         with Path.open(json_file_path) as j:
             doc_dict = json.loads(j.read())
             filename = doc_dict["task"]["data"]["ocr"].split("/")[-1].split(".")[0]
-            logger.warning(f"FILENAME: {filename}")
             # check if old local naming schema is being used
             if len(filename.split("-")) == 6:
                 filename = "-".join(filename.split("-")[2:])

From 61c8abf9ffa357483607334bb68e6b9ac1c13c87 Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Wed, 23 Oct 2024 12:52:03 -0700
Subject: [PATCH 131/161] make pudl editable

---
 environment.yml                                             | 6 +++---
 .../models/sec_eia_record_linkage/preprocessing.py          | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/environment.yml b/environment.yml
index 3ad1cd4..a902ea3 100644
--- a/environment.yml
+++ b/environment.yml
@@ -29,6 +29,6 @@ dependencies:
 
   # Use pip to install the package defined by this repo for development:
   - pip:
-    - git+https://github.com/catalyst-cooperative/pudl.git@main
-    # - -e /Users/katielamb/CatalystCoop/pudl[dev,docs,tests,types]
-    - --editable ./[dev,docs,tests,types]
+      # - git+https://github.com/catalyst-cooperative/pudl.git@main
+      - -e /Users/katielamb/CatalystCoop/pudl
+      - --editable ./[dev,docs,tests,types]
diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py
index 9080cd7..ebb7843 100644
--- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py
+++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py
@@ -166,7 +166,7 @@ def _get_metaphone(row, col_name):
 def _clean_company_name(df):
     df.loc[:, "company_name_clean"] = company_name_cleaner.apply_name_cleaning(
         df[["company_name"]]
-    )
+    ).str.strip()
     df = df[df["company_name_clean"] != ""]
     df = df.rename(columns={"company_name": "company_name_raw"}).rename(
         columns={"company_name_clean": "company_name"}

From e5148d8c70477e158a154463185f2b9494f52fa8 Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Wed, 27 Nov 2024 15:18:41 -0500
Subject: [PATCH 132/161] add in record linkage modules

---
 notebooks/16-kl-splink-ex21-filer-link.ipynb  | 5833 +++++++++++------
 notebooks/18-kl-splink-sec-eia.ipynb          | 4305 ++++++++----
 src/mozilla_sec_eia/models/sec10k/__init__.py |   10 +-
 .../models/sec10k/sec_output_table.py         |  327 +
 .../models/sec10k/utils/cloud.py              |   14 +
 .../create_eia_input.py                       |   76 +
 .../sec_eia_record_linkage/preprocessing.py   |  218 +-
 .../package_data/formDStateCodes.xsd.xml      |  328 +
 8 files changed, 8005 insertions(+), 3106 deletions(-)
 create mode 100644 src/mozilla_sec_eia/models/sec10k/sec_output_table.py
 create mode 100644 src/mozilla_sec_eia/models/sec_eia_record_linkage/create_eia_input.py
 create mode 100644 src/mozilla_sec_eia/package_data/formDStateCodes.xsd.xml

diff --git a/notebooks/16-kl-splink-ex21-filer-link.ipynb b/notebooks/16-kl-splink-ex21-filer-link.ipynb
index 2e656d3..efef952 100644
--- a/notebooks/16-kl-splink-ex21-filer-link.ipynb
+++ b/notebooks/16-kl-splink-ex21-filer-link.ipynb
@@ -15,20 +15,50 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 260,
    "id": "e1222c94-36cd-4bae-95fb-089e5411e490",
    "metadata": {
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[autoreload of mozilla_sec_eia.models.sec10k.utils.cloud failed: Traceback (most recent call last):\n",
+      "  File \"/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/site-packages/IPython/extensions/autoreload.py\", line 274, in check\n",
+      "    superreload(m, reload, self.old_objects, self.shell)\n",
+      "  File \"/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/site-packages/IPython/extensions/autoreload.py\", line 500, in superreload\n",
+      "    update_generic(old_obj, new_obj)\n",
+      "  File \"/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/site-packages/IPython/extensions/autoreload.py\", line 397, in update_generic\n",
+      "    update(a, b)\n",
+      "  File \"/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/site-packages/IPython/extensions/autoreload.py\", line 335, in update_class\n",
+      "    if (old_obj == new_obj) is True:\n",
+      "        ^^^^^^^^^^^^^^^^^^\n",
+      "  File \"/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/_collections_abc.py\", line 834, in __eq__\n",
+      "    return dict(self.items()) == dict(other.items())\n",
+      "           ^^^^^^^^^^^^^^^^^^\n",
+      "  File \"/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/_collections_abc.py\", line 893, in __iter__\n",
+      "    for key in self._mapping:\n",
+      "  File \"/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/site-packages/pydantic/_internal/_mock_val_ser.py\", line 46, in __iter__\n",
+      "    return self._get_built().__iter__()\n",
+      "           ^^^^^^^^^^^^^^^^^\n",
+      "  File \"/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/site-packages/pydantic/_internal/_mock_val_ser.py\", line 57, in _get_built\n",
+      "    raise PydanticUserError(self._error_message, code=self._code)\n",
+      "pydantic.errors.PydanticUserError: Pydantic models should inherit from BaseModel, BaseModel cannot be instantiated directly\n",
+      "\n",
+      "For further information visit https://errors.pydantic.dev/2.9/u/base-model-instantiated\n",
+      "]\n"
+     ]
+    }
+   ],
    "source": [
     "import numpy as np\n",
     "import pandas as pd\n",
     "from upath import UPath\n",
     "\n",
-    "# from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive\n",
-    "# from pudl.analysis.record_linkage import name_cleaner\n",
-    "from mozilla_sec_eia.models.sec_eia_record_linkage.preprocessing import prepare_sec10k_basic_info_df, prepare_ex21_df"
+    "from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive, convert_ex21_id_to_filename\n",
+    "from mozilla_sec_eia.models.sec_eia_record_linkage.preprocessing import prepare_sec10k_basic_info_df, prepare_ex21_df, add_sec_company_id_to_subsidiaries"
    ]
   },
   {
@@ -42,372 +72,296 @@
   {
    "cell_type": "code",
    "execution_count": 3,
-   "id": "67da3bf4-abbd-40c2-850b-1c73953625c8",
+   "id": "c29d0b75-759f-445c-adac-b2a6baf1fd0e",
    "metadata": {
     "tags": []
    },
    "outputs": [],
    "source": [
-    "raw_eia_df = pd.read_parquet(\"s3://pudl.catalyst.coop/stable/out_eia__yearly_utilities.parquet\")"
+    "# for now try just training on 2023\n",
+    "raw_sec_df = pd.concat([pd.read_parquet(\"gs://sec10k-outputs/v2/basic_10k_company_info/2023q1.parquet\"),\n",
+    "                        pd.read_parquet(\"gs://sec10k-outputs/v2/basic_10k_company_info/2023q2.parquet\"),\n",
+    "                        pd.read_parquet(\"gs://sec10k-outputs/v2/basic_10k_company_info/2023q3.parquet\"),\n",
+    "                        pd.read_parquet(\"gs://sec10k-outputs/v2/basic_10k_company_info/2023q4.parquet\"),\n",
+    "                       ]\n",
+    "                      )"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 4,
-   "id": "28bdfdfd-beeb-4097-b4d3-b58a7c30f64d",
+   "id": "dbf3b15c-3a5a-4b74-a929-71aec18750a1",
    "metadata": {
     "tags": []
    },
    "outputs": [],
    "source": [
-    "eia_df = raw_eia_df.copy()"
+    "raw_sec_df = raw_sec_df.reset_index().pivot_table(values=\"value\", index=\"filename\", columns=\"key\", aggfunc=\"first\")\n",
+    "raw_sec_df.columns.name = None"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 5,
-   "id": "ee54bb48-cbe4-4261-9545-d4b2bdcb731e",
-   "metadata": {
-    "tags": []
-   },
+   "id": "a8ec4fad-c92f-4cfc-a3d2-409a72a2df1e",
+   "metadata": {},
    "outputs": [],
    "source": [
-    "mergers_df = pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_mergers.parquet\")"
+    "ex21_path = UPath(\"gs://sec10k-outputs/v2/ex21_company_ownership_info\")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 6,
-   "id": "8e69b4ba-8e7b-4d17-bc8c-a06f059f6015",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "raw_eia861_df = pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__assn_utility.parquet\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "ce60f760-5b94-4889-92c5-ac0ed5cd6d82",
+   "id": "8e7a642d-7718-4101-b851-f1f4ee07180e",
    "metadata": {
     "tags": []
    },
    "outputs": [],
    "source": [
-    "missing_utils = raw_eia861_df[~raw_eia861_df.utility_id_eia.isin(raw_eia_df.utility_id_eia.unique())].utility_id_eia.unique()"
+    "raw_ex21_df = pd.DataFrame()\n",
+    "for file in ex21_path.iterdir():\n",
+    "    if file.name.split(\".\")[-1] == \"parquet\":\n",
+    "        report_year = file.name[:4]\n",
+    "        # for now just train with 2023\n",
+    "        if report_year != \"2023\":\n",
+    "            continue\n",
+    "        year_quarter_df = pd.read_parquet(ex21_path / file.name)\n",
+    "        year_quarter_df.loc[:, \"report_year\"] = report_year\n",
+    "        year_quarter_df.loc[:, \"report_year\"] = pd.to_datetime(year_quarter_df[\"report_year\"], format=\"%Y\").dt.year\n",
+    "        raw_ex21_df = pd.concat([raw_ex21_df, year_quarter_df])"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "a3ef2365-e459-44b3-94b0-77020cd606f2",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
+   "cell_type": "markdown",
+   "id": "f3d5db08-3c42-4715-9f0d-4d02674b828a",
+   "metadata": {},
    "source": [
-    "harvested_df = pd.concat([\n",
-    "    pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_utility_data_misc.parquet\")[[\"report_date\", \"utility_id_eia\", \"utility_name_eia\"]],\n",
-    "    pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_operational_data_misc.parquet\")[[\"report_date\", \"utility_id_eia\", \"utility_name_eia\"]],\n",
-    "    pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_demand_side_management_misc.parquet\")[[\"report_date\", \"utility_id_eia\", \"utility_name_eia\"]],\n",
-    "    pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_energy_efficiency.parquet\")[[\"report_date\", \"utility_id_eia\", \"utility_name_eia\"]],\n",
-    "])"
+    "# Preprocessing"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
-   "id": "59fd9d69-b700-43ec-bb7a-f99eea1e0ec9",
-   "metadata": {
-    "tags": []
-   },
+   "execution_count": null,
+   "id": "39706c77-90db-4f49-8011-47a9777a88b6",
+   "metadata": {},
    "outputs": [],
    "source": [
-    "eia861_df = raw_eia861_df.merge(harvested_df, on=[\"report_date\", \"utility_id_eia\"], how=\"left\").drop_duplicates(subset=[\"report_date\", \"utility_id_eia\"])"
+    "sec_df = prepare_sec10k_basic_info_df(raw_sec_df)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
-   "id": "a47d17c1-0df1-412f-9687-3d540266f005",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
+   "execution_count": 157,
+   "id": "98d4f59e-d61f-4a24-84bc-6caa0d761e07",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/katielamb/CatalystCoop/mozilla-sec-eia/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py:233: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
+      "  )\n"
+     ]
+    }
+   ],
    "source": [
-    "mergers_df = mergers_df[mergers_df[\"new_parent\"].notna()]\n",
-    "eia861_df = eia861_df.merge(mergers_df[[\"report_date\", \"new_parent\", \"merge_address\", \"merge_city\", \"merge_state\"]], \n",
-    "                how=\"left\", \n",
-    "                left_on=[\"report_date\", \"utility_name_eia\"],\n",
-    "                right_on=[\"report_date\", \"new_parent\"]\n",
-    "               )\n",
-    "eia861_df = eia861_df.rename(columns={\"merge_address\": \"street_address\", \"merge_city\": \"city\"})\n",
-    "eia861_df = eia861_df.groupby([\"report_date\", \"utility_id_eia\"]).first().reset_index()"
+    "ex21_df = prepare_ex21_df(raw_ex21_df)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
-   "id": "fa6515b1-5012-4ec0-af96-f9fda11a9c5d",
-   "metadata": {
-    "tags": []
-   },
+   "execution_count": 69,
+   "id": "34a86ec8-5b6c-4147-8f94-021fa271174c",
+   "metadata": {},
    "outputs": [
     {
      "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>report_date</th>\n",
-       "      <th>utility_id_eia</th>\n",
-       "      <th>state</th>\n",
-       "      <th>utility_name_eia</th>\n",
-       "      <th>new_parent</th>\n",
-       "      <th>street_address</th>\n",
-       "      <th>city</th>\n",
-       "      <th>merge_state</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>29933</th>\n",
-       "      <td>2009-01-01</td>\n",
-       "      <td>17698</td>\n",
-       "      <td>LA</td>\n",
-       "      <td>Southwestern Electric Power Co</td>\n",
-       "      <td>Southwestern Electric Power Co</td>\n",
-       "      <td>1 Riverside Plaza</td>\n",
-       "      <td>Columbus</td>\n",
-       "      <td>OH</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>33258</th>\n",
-       "      <td>2010-01-01</td>\n",
-       "      <td>17698</td>\n",
-       "      <td>AR</td>\n",
-       "      <td>Southwestern Electric Power Co</td>\n",
-       "      <td>Southwestern Electric Power Co</td>\n",
-       "      <td>1 Riverside Plaza</td>\n",
-       "      <td>Columbus</td>\n",
-       "      <td>OH</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>49001</th>\n",
-       "      <td>2015-01-01</td>\n",
-       "      <td>11788</td>\n",
-       "      <td>IA</td>\n",
-       "      <td>Consumers Energy</td>\n",
-       "      <td>Consumers Energy</td>\n",
-       "      <td>One Enrgy Plaza</td>\n",
-       "      <td>Jackson</td>\n",
-       "      <td>MI</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>56853</th>\n",
-       "      <td>2017-01-01</td>\n",
-       "      <td>19157</td>\n",
-       "      <td>IA</td>\n",
-       "      <td>MiEnergy Cooperative</td>\n",
-       "      <td>MiEnergy Cooperative</td>\n",
-       "      <td>31110 Cooperative Way</td>\n",
-       "      <td>Rushford</td>\n",
-       "      <td>MN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>70820</th>\n",
-       "      <td>2021-01-01</td>\n",
-       "      <td>40165</td>\n",
-       "      <td>AZ</td>\n",
-       "      <td>Dixie Escalante R E A, Inc</td>\n",
-       "      <td>Dixie Escalante R E A, Inc</td>\n",
-       "      <td>495 N 3200 W</td>\n",
-       "      <td>Flowell</td>\n",
-       "      <td>UT</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
       "text/plain": [
-       "      report_date  utility_id_eia state                utility_name_eia                      new_parent         street_address      city merge_state\n",
-       "29933  2009-01-01           17698    LA  Southwestern Electric Power Co  Southwestern Electric Power Co      1 Riverside Plaza  Columbus          OH\n",
-       "33258  2010-01-01           17698    AR  Southwestern Electric Power Co  Southwestern Electric Power Co      1 Riverside Plaza  Columbus          OH\n",
-       "49001  2015-01-01           11788    IA                Consumers Energy                Consumers Energy        One Enrgy Plaza   Jackson          MI\n",
-       "56853  2017-01-01           19157    IA            MiEnergy Cooperative            MiEnergy Cooperative  31110 Cooperative Way  Rushford          MN\n",
-       "70820  2021-01-01           40165    AZ      Dixie Escalante R E A, Inc      Dixie Escalante R E A, Inc           495 N 3200 W   Flowell          UT"
+       "True"
       ]
      },
-     "execution_count": 11,
+     "execution_count": 69,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "eia861_df[(eia861_df.state != eia861_df.merge_state) & (eia861_df.merge_state.notna())]"
+    "ex21_df.record_id.is_unique"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
-   "id": "8ff7b788-5fef-4e88-94ff-89b25619aed8",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
+   "execution_count": 70,
+   "id": "505b0c45-1748-4517-8cac-d2acf2fa9037",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 70,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "eia861_df[\"state\"] = eia861_df[\"state\"].where(eia861_df[\"merge_state\"].isnull(), eia861_df[\"merge_state\"])"
+    "sec_df.record_id.is_unique"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
-   "id": "17885342-b464-4f4d-ac75-b7be4d4ec7cc",
+   "execution_count": null,
+   "id": "11caf325-8530-430d-a3d2-a54043447021",
    "metadata": {
     "tags": []
    },
    "outputs": [],
    "source": [
-    "eia861_df = eia861_df.drop(columns=[\"new_parent\", \"merge_state\"])"
+    "# sec_df has filename as unique ID\n",
+    "sec_df.filename.is_unique"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 14,
-   "id": "fb71f68d-92da-468b-b8a5-02f5ba4b4459",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
+   "cell_type": "markdown",
+   "id": "ceed053b-f6ae-4aad-8b12-b2083ba8e236",
+   "metadata": {},
    "source": [
-    "eia_df = pd.concat([eia_df, eia861_df])"
+    "Note: not removing paragraph layout docs, but maybe should"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 15,
-   "id": "85402523-e28a-4410-b933-eb71572b9a00",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
+   "cell_type": "markdown",
+   "id": "6de284e1-2b76-418d-ac5e-9a84bd275c51",
+   "metadata": {},
    "source": [
-    "eia_df = eia_df.drop_duplicates(subset=[\"utility_id_eia\", \"report_date\"], keep=\"first\")"
+    "# Try to just match on cleaned name and location"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
-   "id": "94e824d6-dd6a-47db-9447-3363e8d14fe0",
-   "metadata": {
-    "tags": []
-   },
+   "execution_count": 170,
+   "id": "2c9a384d-a9e1-4e4a-829f-e92f1a007c90",
+   "metadata": {},
    "outputs": [],
    "source": [
-    "# not sure at what point this stops being a datetime\n",
-    "eia_df[\"report_date\"] = eia_df[\"report_date\"].astype(\"datetime64[ns]\")"
+    "sec_match_df = sec_df.drop_duplicates(subset=[\"central_index_key\", \"company_name\", \"loc_of_incorporation\"])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
-   "id": "56857668-ecd5-4c62-9286-e50c334750c5",
-   "metadata": {
-    "tags": []
-   },
+   "execution_count": 179,
+   "id": "4bab406d-b1e0-495b-beee-90ae6b0c036b",
+   "metadata": {},
    "outputs": [],
    "source": [
-    "# there are nulls from non harvested 861 utilities\n",
-    "eia_df = eia_df.dropna(subset=\"utility_name_eia\")"
+    "merged_df = sec_match_df.merge(ex21_df, how=\"inner\", on=\"company_name\", suffixes=(\"_sec\", \"_ex21\"))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
-   "id": "c29d0b75-759f-445c-adac-b2a6baf1fd0e",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
+   "execution_count": 185,
+   "id": "b8732fda-9f0a-412c-b7ba-8f307ee7b213",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0       florida\n",
+       "1      delaware\n",
+       "2      missouri\n",
+       "3      delaware\n",
+       "4           NaN\n",
+       "         ...   \n",
+       "515    delaware\n",
+       "516    delaware\n",
+       "517    delaware\n",
+       "518    delaware\n",
+       "519    delaware\n",
+       "Name: loc_of_incorporation_sec, Length: 520, dtype: object"
+      ]
+     },
+     "execution_count": 185,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "# for now try just training on 2023\n",
-    "raw_sec_df = pd.concat([pd.read_parquet(\"gs://sec10k-outputs/v2/basic_10k_company_info/2023q1.parquet\"),\n",
-    "                        pd.read_parquet(\"gs://sec10k-outputs/v2/basic_10k_company_info/2023q2.parquet\"),\n",
-    "                        pd.read_parquet(\"gs://sec10k-outputs/v2/basic_10k_company_info/2023q3.parquet\"),\n",
-    "                        pd.read_parquet(\"gs://sec10k-outputs/v2/basic_10k_company_info/2023q4.parquet\"),\n",
-    "                       ]\n",
-    "                      )"
+    "merged_df[\"loc_of_incorporation_sec\"]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 52,
-   "id": "dbf3b15c-3a5a-4b74-a929-71aec18750a1",
-   "metadata": {
-    "tags": []
-   },
+   "execution_count": 209,
+   "id": "3427d77c-3c3f-4a05-99db-7f96d3f0f193",
+   "metadata": {},
    "outputs": [],
    "source": [
-    "raw_sec_df = raw_sec_df.reset_index().pivot_table(values=\"value\", index=\"filename\", columns=\"key\", aggfunc=\"first\")\n",
-    "raw_sec_df.columns.name = None"
+    "merged_df.loc[:, \"loc_tokens_sec\"] = merged_df[\"loc_of_incorporation_sec\"].fillna(\"\").str.lower().str.split()\n",
+    "merged_df.loc[:, \"loc_tokens_ex21\"] = merged_df[\"loc_of_incorporation_ex21\"].fillna(\"\").str.lower().str.split()\n",
+    "merged_df[\"loc_overlap\"] = merged_df.apply(\n",
+    "    lambda row: len(set(row[\"loc_tokens_sec\"]) & set(row[\"loc_tokens_ex21\"])), axis=1\n",
+    ")\n",
+    "\n",
+    "# Select the row with the highest word overlap for each CIK and company name\n",
+    "closest_match = merged_df.loc[merged_df.groupby([\"central_index_key\", \"company_name\"])['loc_overlap'].idxmax()].reset_index(drop=True)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
-   "id": "a8ec4fad-c92f-4cfc-a3d2-409a72a2df1e",
+   "execution_count": 210,
+   "id": "92cc6570-f34c-4782-9bbf-0cdeaf2ce044",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "False    480\n",
+       "Name: count, dtype: int64"
+      ]
+     },
+     "execution_count": 210,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "ex21_path = UPath(\"gs://sec10k-outputs/v2/ex21_company_ownership_info\")"
+    "# this should be 0\n",
+    "closest_match.duplicated(subset=[\"company_name\", \"loc_of_incorporation_ex21\"]).value_counts()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
-   "id": "8e7a642d-7718-4101-b851-f1f4ee07180e",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
+   "execution_count": 200,
+   "id": "d0c650d0-303d-43a4-9ae3-35c4fb6d481b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "central_index_key\n",
+       "False    480\n",
+       "Name: count, dtype: int64"
+      ]
+     },
+     "execution_count": 200,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "raw_ex21_df = pd.DataFrame()\n",
-    "for file in ex21_path.iterdir():\n",
-    "    if file.name.split(\".\")[-1] == \"parquet\":\n",
-    "        report_year = file.name[:4]\n",
-    "        # for now just train with 2023\n",
-    "        if report_year != \"2023\":\n",
-    "            continue\n",
-    "        year_quarter_df = pd.read_parquet(ex21_path / file.name)\n",
-    "        year_quarter_df.loc[:, \"report_year\"] = report_year\n",
-    "        year_quarter_df.loc[:, \"report_year\"] = pd.to_datetime(year_quarter_df[\"report_year\"], format=\"%Y\").dt.year\n",
-    "        raw_ex21_df = pd.concat([raw_ex21_df, year_quarter_df])"
+    "# it's okay if there's duplication here, but not ideal\n",
+    "# multiple subsidiaries can point to the same CIK\n",
+    "closest_match.central_index_key.duplicated().value_counts()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
-   "id": "7daad7a6-c590-4324-9e31-2bb5c9fa4d6c",
-   "metadata": {
-    "tags": []
-   },
+   "execution_count": 201,
+   "id": "2b3a2c1f-7df4-4515-8727-a339303ebd4e",
+   "metadata": {},
    "outputs": [
     {
      "data": {
@@ -430,588 +384,570 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
-       "      <th>utility_id_eia</th>\n",
-       "      <th>utility_id_pudl</th>\n",
-       "      <th>utility_name_eia</th>\n",
-       "      <th>report_date</th>\n",
-       "      <th>street_address</th>\n",
+       "      <th>record_id_sec</th>\n",
+       "      <th>filename</th>\n",
+       "      <th>phone_number</th>\n",
+       "      <th>central_index_key</th>\n",
        "      <th>city</th>\n",
+       "      <th>company_name_raw_sec</th>\n",
+       "      <th>date_of_name_change</th>\n",
+       "      <th>film_number</th>\n",
+       "      <th>fiscal_year_end</th>\n",
+       "      <th>form_type</th>\n",
+       "      <th>former_conformed_name</th>\n",
+       "      <th>irs_number</th>\n",
+       "      <th>organization_name</th>\n",
+       "      <th>sec_act</th>\n",
+       "      <th>sec_file_number</th>\n",
+       "      <th>standard_industrial_classification</th>\n",
        "      <th>state</th>\n",
+       "      <th>state_of_incorporation</th>\n",
+       "      <th>street_address</th>\n",
+       "      <th>street_address_2</th>\n",
        "      <th>zip_code</th>\n",
-       "      <th>plants_reported_owner</th>\n",
-       "      <th>plants_reported_operator</th>\n",
-       "      <th>...</th>\n",
-       "      <th>contact_lastname</th>\n",
-       "      <th>contact_title</th>\n",
-       "      <th>phone_number</th>\n",
-       "      <th>phone_extension</th>\n",
-       "      <th>contact_firstname_2</th>\n",
-       "      <th>contact_lastname_2</th>\n",
-       "      <th>contact_title_2</th>\n",
-       "      <th>phone_number_2</th>\n",
-       "      <th>phone_extension_2</th>\n",
-       "      <th>data_maturity</th>\n",
+       "      <th>report_date</th>\n",
+       "      <th>report_year_sec</th>\n",
+       "      <th>loc_of_incorporation_sec</th>\n",
+       "      <th>company_name</th>\n",
+       "      <th>company_name_no_legal_sec</th>\n",
+       "      <th>company_name_mphone_sec</th>\n",
+       "      <th>record_id_ex21</th>\n",
+       "      <th>id</th>\n",
+       "      <th>company_name_raw_ex21</th>\n",
+       "      <th>loc_of_incorporation_ex21</th>\n",
+       "      <th>own_per</th>\n",
+       "      <th>report_year_ex21</th>\n",
+       "      <th>company_name_no_legal_ex21</th>\n",
+       "      <th>company_name_mphone_ex21</th>\n",
+       "      <th>loc_tokens_sec</th>\n",
+       "      <th>loc_tokens_ex21</th>\n",
+       "      <th>loc_overlap</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>33</th>\n",
-       "      <td>66292</td>\n",
-       "      <td>16386.0</td>\n",
-       "      <td>Desert Willow Energy Storage</td>\n",
-       "      <td>2023-01-01</td>\n",
-       "      <td>100 Bayview Circle</td>\n",
-       "      <td>Newport Beach</td>\n",
-       "      <td>CA</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>...</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>provisional</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>35</th>\n",
-       "      <td>66291</td>\n",
-       "      <td>16385.0</td>\n",
-       "      <td>Portage Solar Plant</td>\n",
-       "      <td>2023-01-01</td>\n",
-       "      <td>N8917</td>\n",
-       "      <td>Portage</td>\n",
-       "      <td>WI</td>\n",
-       "      <td>53901</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>...</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>provisional</td>\n",
+       "      <th>0</th>\n",
+       "      <td>7990</td>\n",
+       "      <td>edgar/data/910638/0000910638-23-000009.txt</td>\n",
+       "      <td>8033263900</td>\n",
+       "      <td>0000910638</td>\n",
+       "      <td>rock hill</td>\n",
+       "      <td>3d systems corp</td>\n",
+       "      <td>19930816</td>\n",
+       "      <td>23738595</td>\n",
+       "      <td>1231</td>\n",
+       "      <td>10-k</td>\n",
+       "      <td>3 d systems corp</td>\n",
+       "      <td>954431352</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1934 act</td>\n",
+       "      <td>001-34220</td>\n",
+       "      <td>services-prepackaged software [7372]</td>\n",
+       "      <td>sc</td>\n",
+       "      <td>de</td>\n",
+       "      <td>333 three d systems circle</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>29730</td>\n",
+       "      <td>2023-03-16</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>3d systems corporation</td>\n",
+       "      <td>3d systems</td>\n",
+       "      <td>T SSTMS</td>\n",
+       "      <td>150739</td>\n",
+       "      <td>910638-0000910638-23-000009</td>\n",
+       "      <td>3d systems corporation</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>3d systems</td>\n",
+       "      <td>T SSTMS</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>37</th>\n",
-       "      <td>66290</td>\n",
-       "      <td>16384.0</td>\n",
-       "      <td>NSF Energy One LLC</td>\n",
-       "      <td>2023-01-01</td>\n",
-       "      <td>1241 University Ave</td>\n",
-       "      <td>Rochester</td>\n",
-       "      <td>NY</td>\n",
-       "      <td>14607</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>...</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>provisional</td>\n",
+       "      <th>1</th>\n",
+       "      <td>7526</td>\n",
+       "      <td>edgar/data/824142/0000824142-23-000019.txt</td>\n",
+       "      <td>9185832266</td>\n",
+       "      <td>0000824142</td>\n",
+       "      <td>tulsa</td>\n",
+       "      <td>aaon, inc.</td>\n",
+       "      <td>19920703</td>\n",
+       "      <td>23675207</td>\n",
+       "      <td>1231</td>\n",
+       "      <td>10-k</td>\n",
+       "      <td>aaon inc</td>\n",
+       "      <td>870448736</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1934 act</td>\n",
+       "      <td>000-18953</td>\n",
+       "      <td>air cond &amp; warm air heating equip &amp; comm &amp; ind...</td>\n",
+       "      <td>ok</td>\n",
+       "      <td>nv</td>\n",
+       "      <td>2425 south yukon ave.</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>74107</td>\n",
+       "      <td>2023-02-27</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>nevada</td>\n",
+       "      <td>aaon incorporated</td>\n",
+       "      <td>aaon</td>\n",
+       "      <td>N</td>\n",
+       "      <td>142821</td>\n",
+       "      <td>824142-0000824142-23-000019</td>\n",
+       "      <td>aaon, inc</td>\n",
+       "      <td>oklahoma</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>aaon</td>\n",
+       "      <td>N</td>\n",
+       "      <td>[nevada]</td>\n",
+       "      <td>[oklahoma]</td>\n",
+       "      <td>0</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
-       "<p>3 rows × 27 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
-       "    utility_id_eia  utility_id_pudl              utility_name_eia report_date  \\\n",
-       "33           66292          16386.0  Desert Willow Energy Storage  2023-01-01   \n",
-       "35           66291          16385.0           Portage Solar Plant  2023-01-01   \n",
-       "37           66290          16384.0            NSF Energy One LLC  2023-01-01   \n",
-       "\n",
-       "         street_address           city state zip_code plants_reported_owner  \\\n",
-       "33   100 Bayview Circle  Newport Beach    CA     None                  None   \n",
-       "35                N8917        Portage    WI    53901                  None   \n",
-       "37  1241 University Ave      Rochester    NY    14607                  None   \n",
-       "\n",
-       "   plants_reported_operator  ... contact_lastname contact_title phone_number  \\\n",
-       "33                     None  ...             None          None         None   \n",
-       "35                     None  ...             None          None         None   \n",
-       "37                     None  ...             None          None         None   \n",
-       "\n",
-       "   phone_extension contact_firstname_2 contact_lastname_2 contact_title_2  \\\n",
-       "33            None                None               None            None   \n",
-       "35            None                None               None            None   \n",
-       "37            None                None               None            None   \n",
-       "\n",
-       "   phone_number_2 phone_extension_2 data_maturity  \n",
-       "33           None              None   provisional  \n",
-       "35           None              None   provisional  \n",
-       "37           None              None   provisional  \n",
-       "\n",
-       "[3 rows x 27 columns]"
+       "   record_id_sec                                    filename phone_number central_index_key       city company_name_raw_sec date_of_name_change film_number fiscal_year_end form_type former_conformed_name irs_number organization_name   sec_act sec_file_number                 standard_industrial_classification state state_of_incorporation              street_address street_address_2 zip_code report_date  report_year_sec loc_of_incorporation_sec            company_name company_name_no_legal_sec company_name_mphone_sec  record_id_ex21                           id   company_name_raw_ex21 loc_of_incorporation_ex21 own_per  report_year_ex21 company_name_no_legal_ex21 company_name_mphone_ex21 loc_tokens_sec loc_tokens_ex21  loc_overlap\n",
+       "0           7990  edgar/data/910638/0000910638-23-000009.txt   8033263900        0000910638  rock hill      3d systems corp            19930816    23738595            1231      10-k      3 d systems corp  954431352               NaN  1934 act       001-34220               services-prepackaged software [7372]    sc                     de  333 three d systems circle              NaN    29730  2023-03-16             2023                 delaware  3d systems corporation                3d systems                 T SSTMS          150739  910638-0000910638-23-000009  3d systems corporation                  delaware     NaN              2023                 3d systems                  T SSTMS     [delaware]      [delaware]            1\n",
+       "1           7526  edgar/data/824142/0000824142-23-000019.txt   9185832266        0000824142      tulsa           aaon, inc.            19920703    23675207            1231      10-k              aaon inc  870448736               NaN  1934 act       000-18953  air cond & warm air heating equip & comm & ind...    ok                     nv       2425 south yukon ave.              NaN    74107  2023-02-27             2023                   nevada       aaon incorporated                      aaon                       N          142821  824142-0000824142-23-000019               aaon, inc                  oklahoma     NaN              2023                       aaon                        N       [nevada]      [oklahoma]            0"
       ]
      },
-     "execution_count": 25,
+     "execution_count": 201,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "eia_df[(eia_df.street_address.notnull())].head(3)"
+    "closest_match.head(2)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
-   "id": "30c02757-45c0-403c-aa38-7422d3549a2b",
-   "metadata": {
-    "tags": []
-   },
+   "execution_count": 241,
+   "id": "78dfc42c-3921-444e-8342-d34fc2fd1a7a",
+   "metadata": {},
    "outputs": [],
    "source": [
-    "eia_subset = eia_df[eia_df.report_date == \"2020-01-01\"]"
+    "ex21_with_cik = ex21_df.merge(\n",
+    "    closest_match[[\"company_name\", \"central_index_key\", \"loc_of_incorporation_ex21\"]].rename(columns={\"loc_of_incorporation_ex21\": \"loc_of_incorporation\"}),\n",
+    "    how=\"left\",\n",
+    "    on=[\"company_name\", \"loc_of_incorporation\"],\n",
+    ").rename(columns={\"central_index_key\": \"subsidiary_cik\"})"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 58,
-   "id": "1c0365a3-51d2-455b-8863-bc4dc22572f9",
-   "metadata": {
-    "tags": []
-   },
+   "execution_count": 242,
+   "id": "1f4bca08-3a65-484d-ac6b-cb7d4584b4e7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ex21_with_cik = ex21_with_cik.merge(closest_match[[\"company_name\", \"central_index_key\"]],\n",
+    "                                    how=\"left\",\n",
+    "                                    on=\"company_name\"\n",
+    "                                   ).rename(columns={\"central_index_key\": \"company_name_merge_cik\"})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 243,
+   "id": "5462d9bb-23dd-45fb-b5bf-35396caba399",
+   "metadata": {},
    "outputs": [
     {
      "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>utility_id_eia</th>\n",
-       "      <th>utility_id_pudl</th>\n",
-       "      <th>utility_name_eia</th>\n",
-       "      <th>report_date</th>\n",
-       "      <th>street_address</th>\n",
-       "      <th>city</th>\n",
-       "      <th>state</th>\n",
-       "      <th>zip_code</th>\n",
-       "      <th>plants_reported_owner</th>\n",
-       "      <th>plants_reported_operator</th>\n",
-       "      <th>...</th>\n",
-       "      <th>contact_lastname</th>\n",
-       "      <th>contact_title</th>\n",
-       "      <th>phone_number</th>\n",
-       "      <th>phone_extension</th>\n",
-       "      <th>contact_firstname_2</th>\n",
-       "      <th>contact_lastname_2</th>\n",
-       "      <th>contact_title_2</th>\n",
-       "      <th>phone_number_2</th>\n",
-       "      <th>phone_extension_2</th>\n",
-       "      <th>data_maturity</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>71566</th>\n",
-       "      <td>5416</td>\n",
-       "      <td>90.0</td>\n",
-       "      <td>Duke Energy Corp</td>\n",
-       "      <td>2010-01-01</td>\n",
-       "      <td>P O Box 1006</td>\n",
-       "      <td>Charlotte</td>\n",
-       "      <td>NC</td>\n",
-       "      <td>28202</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>...</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>final</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>71568</th>\n",
-       "      <td>5416</td>\n",
-       "      <td>90.0</td>\n",
-       "      <td>Duke Energy Corp</td>\n",
-       "      <td>2008-01-01</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Charlotte</td>\n",
-       "      <td>NC</td>\n",
-       "      <td>28201</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>...</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>final</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>71569</th>\n",
-       "      <td>5416</td>\n",
-       "      <td>90.0</td>\n",
-       "      <td>Duke Energy Corp</td>\n",
-       "      <td>2007-01-01</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Charlotte</td>\n",
-       "      <td>NC</td>\n",
-       "      <td>28201</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>...</td>\n",
-       "      <td>Ashcraft</td>\n",
-       "      <td>Sr. Engineering Technologist</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Robert</td>\n",
-       "      <td>Mc Murry</td>\n",
-       "      <td>Dir Carolinas Integrated Resou</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>final</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>71570</th>\n",
-       "      <td>5416</td>\n",
-       "      <td>90.0</td>\n",
-       "      <td>Duke Energy Corp</td>\n",
-       "      <td>2006-01-01</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Charlotte</td>\n",
-       "      <td>NC</td>\n",
-       "      <td>28201</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>...</td>\n",
-       "      <td>Duckworth</td>\n",
-       "      <td>Planning Engineer</td>\n",
-       "      <td>704-382-4327</td>\n",
-       "      <td>382</td>\n",
-       "      <td>Steven</td>\n",
-       "      <td>Jester</td>\n",
-       "      <td>Director, Rate Admn &amp; Cust Inq</td>\n",
-       "      <td>704-382-4887</td>\n",
-       "      <td>None</td>\n",
-       "      <td>final</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>71571</th>\n",
-       "      <td>5416</td>\n",
-       "      <td>90.0</td>\n",
-       "      <td>Duke Energy Corp</td>\n",
-       "      <td>2005-01-01</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Charlotte</td>\n",
-       "      <td>NC</td>\n",
-       "      <td>28201</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>...</td>\n",
-       "      <td>Duckworth</td>\n",
-       "      <td>Planning Engineer</td>\n",
-       "      <td>704-382-4327</td>\n",
-       "      <td>382</td>\n",
-       "      <td>Steven</td>\n",
-       "      <td>Jester</td>\n",
-       "      <td>Director, Rate Admn &amp; Cust Inq</td>\n",
-       "      <td>704-382-4887</td>\n",
-       "      <td>None</td>\n",
-       "      <td>final</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>71572</th>\n",
-       "      <td>5416</td>\n",
-       "      <td>90.0</td>\n",
-       "      <td>Duke Energy Corp</td>\n",
-       "      <td>2004-01-01</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Charlotte</td>\n",
-       "      <td>NC</td>\n",
-       "      <td>28201</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>...</td>\n",
-       "      <td>Duckworth</td>\n",
-       "      <td>Planning Engineer</td>\n",
-       "      <td>704-382-4327</td>\n",
-       "      <td>0</td>\n",
-       "      <td>Steven</td>\n",
-       "      <td>Jester</td>\n",
-       "      <td>Director, Rate Admn &amp; Cust Inq</td>\n",
-       "      <td>704-382-4887</td>\n",
-       "      <td>None</td>\n",
-       "      <td>final</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>71573</th>\n",
-       "      <td>5416</td>\n",
-       "      <td>90.0</td>\n",
-       "      <td>Duke Energy Corp</td>\n",
-       "      <td>2003-01-01</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Charlotte</td>\n",
-       "      <td>NC</td>\n",
-       "      <td>28201</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>...</td>\n",
-       "      <td>Duckworth</td>\n",
-       "      <td>Process Leader</td>\n",
-       "      <td>None</td>\n",
-       "      <td>0</td>\n",
-       "      <td>Steven</td>\n",
-       "      <td>Jester</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>final</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>71574</th>\n",
-       "      <td>5416</td>\n",
-       "      <td>90.0</td>\n",
-       "      <td>Duke Energy Corp</td>\n",
-       "      <td>2002-01-01</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Charlotte</td>\n",
-       "      <td>NC</td>\n",
-       "      <td>28201</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>...</td>\n",
-       "      <td>Scott Henry</td>\n",
-       "      <td>Process Leader</td>\n",
-       "      <td>None</td>\n",
-       "      <td>0</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Mgr Reg Policy $ Res</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>final</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>71575</th>\n",
-       "      <td>5416</td>\n",
-       "      <td>90.0</td>\n",
-       "      <td>Duke Energy Corp</td>\n",
-       "      <td>2001-01-01</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Charlotte</td>\n",
-       "      <td>NC</td>\n",
-       "      <td>28201</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>...</td>\n",
-       "      <td>R S Henry</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>0</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Mgr Operating Plann &amp; Analysis</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>final</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>9 rows × 27 columns</p>\n",
-       "</div>"
-      ],
       "text/plain": [
-       "       utility_id_eia  utility_id_pudl  utility_name_eia report_date  \\\n",
-       "71566            5416             90.0  Duke Energy Corp  2010-01-01   \n",
-       "71568            5416             90.0  Duke Energy Corp  2008-01-01   \n",
-       "71569            5416             90.0  Duke Energy Corp  2007-01-01   \n",
-       "71570            5416             90.0  Duke Energy Corp  2006-01-01   \n",
-       "71571            5416             90.0  Duke Energy Corp  2005-01-01   \n",
-       "71572            5416             90.0  Duke Energy Corp  2004-01-01   \n",
-       "71573            5416             90.0  Duke Energy Corp  2003-01-01   \n",
-       "71574            5416             90.0  Duke Energy Corp  2002-01-01   \n",
-       "71575            5416             90.0  Duke Energy Corp  2001-01-01   \n",
-       "\n",
-       "      street_address       city state zip_code plants_reported_owner  \\\n",
-       "71566   P O Box 1006  Charlotte    NC    28202                  None   \n",
-       "71568           None  Charlotte    NC    28201                  None   \n",
-       "71569           None  Charlotte    NC    28201                  None   \n",
-       "71570           None  Charlotte    NC    28201                  None   \n",
-       "71571           None  Charlotte    NC    28201                  None   \n",
-       "71572           None  Charlotte    NC    28201                  None   \n",
-       "71573           None  Charlotte    NC    28201                  None   \n",
-       "71574           None  Charlotte    NC    28201                  None   \n",
-       "71575           None  Charlotte    NC    28201                  None   \n",
-       "\n",
-       "      plants_reported_operator  ... contact_lastname  \\\n",
-       "71566                     None  ...             None   \n",
-       "71568                     None  ...             None   \n",
-       "71569                     None  ...         Ashcraft   \n",
-       "71570                     None  ...        Duckworth   \n",
-       "71571                     None  ...        Duckworth   \n",
-       "71572                     None  ...        Duckworth   \n",
-       "71573                     None  ...        Duckworth   \n",
-       "71574                     None  ...      Scott Henry   \n",
-       "71575                     None  ...        R S Henry   \n",
-       "\n",
-       "                      contact_title  phone_number phone_extension  \\\n",
-       "71566                          None          None            None   \n",
-       "71568                          None          None            None   \n",
-       "71569  Sr. Engineering Technologist          None            None   \n",
-       "71570             Planning Engineer  704-382-4327             382   \n",
-       "71571             Planning Engineer  704-382-4327             382   \n",
-       "71572             Planning Engineer  704-382-4327               0   \n",
-       "71573                Process Leader          None               0   \n",
-       "71574                Process Leader          None               0   \n",
-       "71575                          None          None               0   \n",
-       "\n",
-       "      contact_firstname_2 contact_lastname_2                 contact_title_2  \\\n",
-       "71566                None               None                            None   \n",
-       "71568                None               None                            None   \n",
-       "71569              Robert           Mc Murry  Dir Carolinas Integrated Resou   \n",
-       "71570              Steven             Jester  Director, Rate Admn & Cust Inq   \n",
-       "71571              Steven             Jester  Director, Rate Admn & Cust Inq   \n",
-       "71572              Steven             Jester  Director, Rate Admn & Cust Inq   \n",
-       "71573              Steven             Jester                            None   \n",
-       "71574                None               None            Mgr Reg Policy $ Res   \n",
-       "71575                None               None  Mgr Operating Plann & Analysis   \n",
-       "\n",
-       "      phone_number_2 phone_extension_2 data_maturity  \n",
-       "71566           None              None         final  \n",
-       "71568           None              None         final  \n",
-       "71569           None              None         final  \n",
-       "71570   704-382-4887              None         final  \n",
-       "71571   704-382-4887              None         final  \n",
-       "71572   704-382-4887              None         final  \n",
-       "71573           None              None         final  \n",
-       "71574           None              None         final  \n",
-       "71575           None              None         final  \n",
-       "\n",
-       "[9 rows x 27 columns]"
+       "subsidiary_cik\n",
+       "True     191387\n",
+       "False       480\n",
+       "Name: count, dtype: int64"
       ]
      },
-     "execution_count": 58,
+     "execution_count": 243,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "eia_df[(eia_df.utility_name_eia.str.contains(\"Duke Energy Corp\")) & (eia_df.state == \"NC\")].drop_duplicates()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "f3d5db08-3c42-4715-9f0d-4d02674b828a",
-   "metadata": {},
-   "source": [
-    "# Preprocessing"
+    "ex21_with_cik.subsidiary_cik.isnull().value_counts()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 82,
-   "id": "39706c77-90db-4f49-8011-47a9777a88b6",
+   "execution_count": 244,
+   "id": "a38c45ad-56f3-49ad-bd62-fb91c4d89940",
    "metadata": {},
    "outputs": [],
    "source": [
-    "sec_df = prepare_sec10k_basic_info_df(raw_sec_df)"
+    "# if a subsidiary doesn't have a CIK and has a null location\n",
+    "# but its name was assigned a CIK (with a different location)\n",
+    "# then assign that CIK to the subsidiary\n",
+    "ex21_with_cik[\"subsidiary_cik\"] = ex21_with_cik[\"subsidiary_cik\"].where(\n",
+    "    ~(ex21_with_cik.subsidiary_cik.isnull()) | ~(ex21_with_cik.loc_of_incorporation.isnull()), \n",
+    "    ex21_with_cik[\"company_name_merge_cik\"]\n",
+    ")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 83,
-   "id": "98d4f59e-d61f-4a24-84bc-6caa0d761e07",
+   "execution_count": 245,
+   "id": "4cca9da1-8371-4b45-b88d-8c2911209707",
    "metadata": {},
-   "outputs": [],
-   "source": [
-    "ex21_df = prepare_ex21_df(raw_ex21_df)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 84,
-   "id": "11caf325-8530-430d-a3d2-a54043447021",
-   "metadata": {
-    "tags": []
-   },
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "True"
+       "subsidiary_cik\n",
+       "True     191386\n",
+       "False       481\n",
+       "Name: count, dtype: int64"
       ]
      },
-     "execution_count": 84,
+     "execution_count": 245,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "# sec_df has filename as unique ID\n",
-    "sec_df.filename.is_unique"
+    "ex21_with_cik.subsidiary_cik.isnull().value_counts()"
    ]
   },
   {
-   "cell_type": "markdown",
-   "id": "ceed053b-f6ae-4aad-8b12-b2083ba8e236",
+   "cell_type": "code",
+   "execution_count": 252,
+   "id": "e5b57a88-ffaa-4834-bea4-c5b4779bd551",
    "metadata": {},
+   "outputs": [],
    "source": [
-    "Note: not removing paragraph layout docs, but maybe should"
+    "archive = GCSArchive()\n",
+    "md = archive.get_metadata()"
    ]
   },
   {
-   "cell_type": "markdown",
-   "id": "1bb694c9-cfbd-4e2f-b69c-9996a588d2d2",
-   "metadata": {
-    "tags": []
-   },
+   "cell_type": "code",
+   "execution_count": 261,
+   "id": "a33be6e3-056f-4e4a-acd4-9a6dc6f98c90",
+   "metadata": {},
+   "outputs": [],
    "source": [
-    "# Match Ex. 21 Subsidiaries to a SEC filer"
+    "ex21_with_cik.loc[:, \"filename\"] = convert_ex21_id_to_filename(ex21_with_cik)"
    ]
   },
   {
-   "cell_type": "markdown",
-   "id": "01d3a5e1-ad17-4266-b2ef-358f246749db",
-   "metadata": {
-    "tags": []
-   },
+   "cell_type": "code",
+   "execution_count": 263,
+   "id": "d0dec8af-d730-4a06-af5e-f390fa228ac8",
+   "metadata": {},
+   "outputs": [],
    "source": [
-    "## Preprocessing"
+    "ex21_with_cik = ex21_with_cik.merge(md[\"cik\"], how=\"left\", left_on=\"filename\", right_index=True).rename(columns={\"cik\": \"parent_cik\"})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 264,
+   "id": "228a1d4b-bc19-49eb-b557-4f26d1febbd9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ex21_with_cik = add_sec_company_id_to_subsidiaries(ex21_with_cik)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 265,
+   "id": "c1b88c44-81d7-4d9d-a2a3-be1b030348bd",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>record_id</th>\n",
+       "      <th>id</th>\n",
+       "      <th>company_name_raw</th>\n",
+       "      <th>loc_of_incorporation</th>\n",
+       "      <th>own_per</th>\n",
+       "      <th>report_year</th>\n",
+       "      <th>company_name</th>\n",
+       "      <th>company_name_no_legal</th>\n",
+       "      <th>company_name_mphone</th>\n",
+       "      <th>subsidiary_cik</th>\n",
+       "      <th>company_name_merge_cik</th>\n",
+       "      <th>filename</th>\n",
+       "      <th>parent_cik</th>\n",
+       "      <th>sec_company_id</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>164482</th>\n",
+       "      <td>164482</td>\n",
+       "      <td>1000045-0000950170-23-030037</td>\n",
+       "      <td>nicholas data services, inc</td>\n",
+       "      <td>florida</td>\n",
+       "      <td>100.0</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>nicholas data services incorporated</td>\n",
+       "      <td>nicholas data services</td>\n",
+       "      <td>NXLS TT SRFSS</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>edgar/data/1000045/0000950170-23-030037.txt</td>\n",
+       "      <td>1000045</td>\n",
+       "      <td>1000045_1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>164481</th>\n",
+       "      <td>164481</td>\n",
+       "      <td>1000045-0000950170-23-030037</td>\n",
+       "      <td>nicholas financial, inc</td>\n",
+       "      <td>florida</td>\n",
+       "      <td>100.0</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>nicholas financial incorporated</td>\n",
+       "      <td>nicholas financial</td>\n",
+       "      <td>NXLS FNNXL</td>\n",
+       "      <td>0001000045</td>\n",
+       "      <td>0001000045</td>\n",
+       "      <td>edgar/data/1000045/0000950170-23-030037.txt</td>\n",
+       "      <td>1000045</td>\n",
+       "      <td>0001000045</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>89</th>\n",
+       "      <td>89</td>\n",
+       "      <td>1000209-0000950170-23-007273</td>\n",
+       "      <td>medallion bank</td>\n",
+       "      <td>utah</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>medallion bank</td>\n",
+       "      <td>medallion bank</td>\n",
+       "      <td>MTLN BNK</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>edgar/data/1000209/0000950170-23-007273.txt</td>\n",
+       "      <td>1000209</td>\n",
+       "      <td>1000209_1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>88</th>\n",
+       "      <td>88</td>\n",
+       "      <td>1000209-0000950170-23-007273</td>\n",
+       "      <td>freshstart venture capital corp</td>\n",
+       "      <td>new york</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>freshstart venture capital corporation</td>\n",
+       "      <td>freshstart venture capital</td>\n",
+       "      <td>FRXSTRT FNTR KPTL</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>edgar/data/1000209/0000950170-23-007273.txt</td>\n",
+       "      <td>1000209</td>\n",
+       "      <td>1000209_2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>87</th>\n",
+       "      <td>87</td>\n",
+       "      <td>1000209-0000950170-23-007273</td>\n",
+       "      <td>medallion capital, inc</td>\n",
+       "      <td>minnesota</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>medallion capital incorporated</td>\n",
+       "      <td>medallion capital</td>\n",
+       "      <td>MTLN KPTL</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>edgar/data/1000209/0000950170-23-007273.txt</td>\n",
+       "      <td>1000209</td>\n",
+       "      <td>1000209_3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>161957</th>\n",
+       "      <td>161957</td>\n",
+       "      <td>9984-0000009984-23-000060</td>\n",
+       "      <td>barnes molding solutions korea limited</td>\n",
+       "      <td>korea</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>barnes molding solutions korea limited</td>\n",
+       "      <td>barnes molding solutions korea</td>\n",
+       "      <td>BRNS MLTNK SLXNS KR</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>edgar/data/9984/0000009984-23-000060.txt</td>\n",
+       "      <td>9984</td>\n",
+       "      <td>9984_99</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>161956</th>\n",
+       "      <td>161956</td>\n",
+       "      <td>9984-0000009984-23-000060</td>\n",
+       "      <td>barnes molding solutions (jiangsu) co., ltd</td>\n",
+       "      <td>china</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>barnes molding solutions company limited</td>\n",
+       "      <td>barnes molding solutions</td>\n",
+       "      <td>BRNS MLTNK SLXNS</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>edgar/data/9984/0000009984-23-000060.txt</td>\n",
+       "      <td>9984</td>\n",
+       "      <td>9984_100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>161955</th>\n",
+       "      <td>161955</td>\n",
+       "      <td>9984-0000009984-23-000060</td>\n",
+       "      <td>barnes korea ltd</td>\n",
+       "      <td>korea</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>barnes korea limited</td>\n",
+       "      <td>barnes korea</td>\n",
+       "      <td>BRNS KR</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>edgar/data/9984/0000009984-23-000060.txt</td>\n",
+       "      <td>9984</td>\n",
+       "      <td>9984_101</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>161965</th>\n",
+       "      <td>161965</td>\n",
+       "      <td>9984-0000009984-23-000060</td>\n",
+       "      <td>gimatic automation india pvt ltd</td>\n",
+       "      <td>india</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>gimatic automation india pvt limited</td>\n",
+       "      <td>gimatic automation india pvt</td>\n",
+       "      <td>JMTK ATMXN INT PFT</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>edgar/data/9984/0000009984-23-000060.txt</td>\n",
+       "      <td>9984</td>\n",
+       "      <td>9984_102</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>162018</th>\n",
+       "      <td>162018</td>\n",
+       "      <td>9984-0000009984-23-000060</td>\n",
+       "      <td>synventive molding solutions ltda</td>\n",
+       "      <td>brazil</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>synventive molding solutions ltda</td>\n",
+       "      <td>synventive molding solutions ltda</td>\n",
+       "      <td>SNFNTF MLTNK SLXNS LTT</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>edgar/data/9984/0000009984-23-000060.txt</td>\n",
+       "      <td>9984</td>\n",
+       "      <td>9984_103</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>191867 rows × 14 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "        record_id                            id                             company_name_raw loc_of_incorporation own_per  report_year                              company_name              company_name_no_legal     company_name_mphone subsidiary_cik company_name_merge_cik                                     filename parent_cik sec_company_id\n",
+       "164482     164482  1000045-0000950170-23-030037                  nicholas data services, inc              florida   100.0         2023       nicholas data services incorporated             nicholas data services           NXLS TT SRFSS            NaN                    NaN  edgar/data/1000045/0000950170-23-030037.txt    1000045      1000045_1\n",
+       "164481     164481  1000045-0000950170-23-030037                      nicholas financial, inc              florida   100.0         2023           nicholas financial incorporated                 nicholas financial              NXLS FNNXL     0001000045             0001000045  edgar/data/1000045/0000950170-23-030037.txt    1000045     0001000045\n",
+       "89             89  1000209-0000950170-23-007273                               medallion bank                 utah     NaN         2023                            medallion bank                     medallion bank                MTLN BNK            NaN                    NaN  edgar/data/1000209/0000950170-23-007273.txt    1000209      1000209_1\n",
+       "88             88  1000209-0000950170-23-007273              freshstart venture capital corp             new york     NaN         2023    freshstart venture capital corporation         freshstart venture capital       FRXSTRT FNTR KPTL            NaN                    NaN  edgar/data/1000209/0000950170-23-007273.txt    1000209      1000209_2\n",
+       "87             87  1000209-0000950170-23-007273                       medallion capital, inc            minnesota     NaN         2023            medallion capital incorporated                  medallion capital               MTLN KPTL            NaN                    NaN  edgar/data/1000209/0000950170-23-007273.txt    1000209      1000209_3\n",
+       "...           ...                           ...                                          ...                  ...     ...          ...                                       ...                                ...                     ...            ...                    ...                                          ...        ...            ...\n",
+       "161957     161957     9984-0000009984-23-000060       barnes molding solutions korea limited                korea     NaN         2023    barnes molding solutions korea limited     barnes molding solutions korea     BRNS MLTNK SLXNS KR            NaN                    NaN     edgar/data/9984/0000009984-23-000060.txt       9984        9984_99\n",
+       "161956     161956     9984-0000009984-23-000060  barnes molding solutions (jiangsu) co., ltd                china     NaN         2023  barnes molding solutions company limited           barnes molding solutions        BRNS MLTNK SLXNS            NaN                    NaN     edgar/data/9984/0000009984-23-000060.txt       9984       9984_100\n",
+       "161955     161955     9984-0000009984-23-000060                             barnes korea ltd                korea     NaN         2023                      barnes korea limited                       barnes korea                 BRNS KR            NaN                    NaN     edgar/data/9984/0000009984-23-000060.txt       9984       9984_101\n",
+       "161965     161965     9984-0000009984-23-000060             gimatic automation india pvt ltd                india     NaN         2023      gimatic automation india pvt limited       gimatic automation india pvt      JMTK ATMXN INT PFT            NaN                    NaN     edgar/data/9984/0000009984-23-000060.txt       9984       9984_102\n",
+       "162018     162018     9984-0000009984-23-000060            synventive molding solutions ltda               brazil     NaN         2023         synventive molding solutions ltda  synventive molding solutions ltda  SNFNTF MLTNK SLXNS LTT            NaN                    NaN     edgar/data/9984/0000009984-23-000060.txt       9984       9984_103\n",
+       "\n",
+       "[191867 rows x 14 columns]"
+      ]
+     },
+     "execution_count": 265,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "ex21_with_cik"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 266,
+   "id": "192d3cac-b156-4e5c-8148-0cbdc3e8900d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ex21_with_cik.to_parquet(\"ex21_2023.parquet\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1bb694c9-cfbd-4e2f-b69c-9996a588d2d2",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "# Match Ex. 21 Subsidiaries to a SEC filer"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "01d3a5e1-ad17-4266-b2ef-358f246749db",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "## Preprocessing"
    ]
   },
   {
@@ -1136,7 +1072,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 86,
+   "execution_count": 24,
    "id": "24890018-8efb-445f-ad91-ca316edccbe8",
    "metadata": {},
    "outputs": [],
@@ -1146,7 +1082,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 87,
+   "execution_count": 25,
    "id": "83f859df-1764-4e97-addc-0064bdcb31b7",
    "metadata": {
     "tags": []
@@ -1156,12 +1092,12 @@
      "data": {
       "text/plain": [
        "loc_of_incorporation\n",
-       "False    6359\n",
-       "True      748\n",
+       "False    6382\n",
+       "True      749\n",
        "Name: count, dtype: int64"
       ]
      },
-     "execution_count": 87,
+     "execution_count": 25,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1172,7 +1108,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 88,
+   "execution_count": 26,
    "id": "e9d0828f-0ad8-41ea-a449-ddd274a888d0",
    "metadata": {
     "tags": []
@@ -1192,7 +1128,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 89,
+   "execution_count": 71,
    "id": "4ca07927-185d-4bc6-978a-e8788a8f77b3",
    "metadata": {
     "tags": []
@@ -1202,30 +1138,30 @@
      "data": {
       "text/plain": [
        "company_name\n",
-       "rush truck center                                           120\n",
-       "encompass health rehabilitation hospital                     79\n",
-       "rush peterbilt truck center                                  57\n",
-       "branch                                                       52\n",
-       "sci funeral services, llc iowa limited liability company     33\n",
-       "partnership limited partnership                              32\n",
-       "alderwoods group, llc de limited liability company           27\n",
-       "encompass health rehabilitation hospital of                  26\n",
-       "u haul co. of                                                26\n",
-       "at&t                                                         25\n",
-       "corporation                                                  21\n",
-       "amh portfolio management                                     20\n",
-       "rush bus center                                              20\n",
-       "limited partnership limited partnership                      18\n",
-       "rapy limited partnership                                     15\n",
-       "rush isuzu trucks                                            15\n",
-       "colgate palmolive limited                                    14\n",
-       "ecolab limited                                               11\n",
-       "rush truck centres                                           11\n",
-       "johnson and johnson limited                                  11\n",
+       "rush truck center                                          120\n",
+       "encompass health rehabilitation hospital                    79\n",
+       "rush peterbilt truck center                                 57\n",
+       "branch                                                      52\n",
+       "sci funeral services llc iowa limited liability company     33\n",
+       "partnership limited partnership                             32\n",
+       "alderwoods group llc de limited liability company           27\n",
+       "encompass health rehabilitation hospital of                 26\n",
+       "u haul co of                                                26\n",
+       "at and t                                                    25\n",
+       "corporation                                                 21\n",
+       "amh portfolio management                                    20\n",
+       "rush bus center                                             20\n",
+       "limited partnership limited partnership                     18\n",
+       "therapy limited partnership                                 15\n",
+       "rush isuzu trucks                                           15\n",
+       "colgate palmolive limited                                   14\n",
+       "johnson and johnson limited                                 11\n",
+       "ecolab limited                                              11\n",
+       "rush truck centres                                          11\n",
        "Name: count, dtype: int64"
       ]
      },
-     "execution_count": 89,
+     "execution_count": 71,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1236,7 +1172,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 90,
+   "execution_count": 72,
    "id": "8a4839e5-a2e5-4098-826a-4d340cdde638",
    "metadata": {
     "tags": []
@@ -1247,6 +1183,48 @@
     "sec_match_df = sec_match_df[[\"record_id\", \"report_year\", \"company_name\", \"loc_of_incorporation\", \"company_name_mphone\"]]"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 73,
+   "id": "baab7dfc-4efb-4c08-b090-32dd47025e15",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/var/folders/c0/5zrbrqhx17d5jm6t03bw2nkw0000gn/T/ipykernel_26291/3959766958.py:2: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  sec_match_df.loc[:, \"company_name_mphone_list\"] = sec_match_df[\"company_name_mphone\"].str.split(\" \")\n",
+      "/var/folders/c0/5zrbrqhx17d5jm6t03bw2nkw0000gn/T/ipykernel_26291/3959766958.py:3: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  ex21_match_df.loc[:, \"company_name_mphone_list\"] = ex21_match_df[\"company_name_mphone\"].str.split(\" \")\n"
+     ]
+    }
+   ],
+   "source": [
+    "# TEMP\n",
+    "sec_match_df.loc[:, \"company_name_mphone_list\"] = sec_match_df[\"company_name_mphone\"].str.split(\" \")\n",
+    "ex21_match_df.loc[:, \"company_name_mphone_list\"] = ex21_match_df[\"company_name_mphone\"].str.split(\" \")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 74,
+   "id": "a1a6634e-e554-4a94-8a57-c2755048db22",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sec_match_df.loc[:, \"loc_list\"] = sec_match_df[\"loc_of_incorporation\"].str.replace(\",\", '').str.split(\" \")\n",
+    "ex21_match_df.loc[:, \"loc_list\"] = ex21_match_df[\"loc_of_incorporation\"].str.replace(\",\", '').str.split(\" \")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "c294372b-159c-4c90-a031-61c34532b965",
@@ -1257,7 +1235,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 91,
+   "execution_count": 75,
    "id": "c9dbc620-ed49-4a8e-9d02-6b6f2e0a14cf",
    "metadata": {
     "tags": []
@@ -1272,7 +1250,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 102,
+   "execution_count": 76,
    "id": "422ca098-e4e7-4284-8b04-74e976e36023",
    "metadata": {
     "tags": []
@@ -1284,7 +1262,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 93,
+   "execution_count": 77,
    "id": "232b5718-c1ed-4e63-8384-b4acf33210d3",
    "metadata": {
     "tags": []
@@ -1295,115 +1273,23 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-9fc6a32a878140b19ee19cd6c6006b48.vega-embed {\n",
-       "    width: 100%;\n",
-       "    display: flex;\n",
-       "  }\n",
-       "\n",
-       "  #altair-viz-9fc6a32a878140b19ee19cd6c6006b48.vega-embed details,\n",
-       "  #altair-viz-9fc6a32a878140b19ee19cd6c6006b48.vega-embed details summary {\n",
-       "    position: relative;\n",
-       "  }\n",
-       "</style>\n",
-       "<div id=\"altair-viz-9fc6a32a878140b19ee19cd6c6006b48\"></div>\n",
-       "<script type=\"text/javascript\">\n",
-       "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
-       "  (function(spec, embedOpt){\n",
-       "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-9fc6a32a878140b19ee19cd6c6006b48\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-9fc6a32a878140b19ee19cd6c6006b48\");\n",
-       "    }\n",
-       "    const paths = {\n",
-       "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
-       "      \"vega-lib\": \"https://cdn.jsdelivr.net/npm/vega-lib?noext\",\n",
-       "      \"vega-lite\": \"https://cdn.jsdelivr.net/npm/vega-lite@5.20.1?noext\",\n",
-       "      \"vega-embed\": \"https://cdn.jsdelivr.net/npm/vega-embed@6?noext\",\n",
-       "    };\n",
-       "\n",
-       "    function maybeLoadScript(lib, version) {\n",
-       "      var key = `${lib.replace(\"-\", \"\")}_version`;\n",
-       "      return (VEGA_DEBUG[key] == version) ?\n",
-       "        Promise.resolve(paths[lib]) :\n",
-       "        new Promise(function(resolve, reject) {\n",
-       "          var s = document.createElement('script');\n",
-       "          document.getElementsByTagName(\"head\")[0].appendChild(s);\n",
-       "          s.async = true;\n",
-       "          s.onload = () => {\n",
-       "            VEGA_DEBUG[key] = version;\n",
-       "            return resolve(paths[lib]);\n",
-       "          };\n",
-       "          s.onerror = () => reject(`Error loading script: ${paths[lib]}`);\n",
-       "          s.src = paths[lib];\n",
-       "        });\n",
-       "    }\n",
-       "\n",
-       "    function showError(err) {\n",
-       "      outputDiv.innerHTML = `<div class=\"error\" style=\"color:red;\">${err}</div>`;\n",
-       "      throw err;\n",
-       "    }\n",
-       "\n",
-       "    function displayChart(vegaEmbed) {\n",
-       "      vegaEmbed(outputDiv, spec, embedOpt)\n",
-       "        .catch(err => showError(`Javascript Error: ${err.message}<br>This usually means there's a typo in your chart specification. See the javascript console for the full traceback.`));\n",
-       "    }\n",
-       "\n",
-       "    if(typeof define === \"function\" && define.amd) {\n",
-       "      requirejs.config({paths});\n",
-       "      require([\"vega-embed\"], displayChart, err => showError(`Error loading script: ${err.message}`));\n",
-       "    } else {\n",
-       "      maybeLoadScript(\"vega\", \"5\")\n",
-       "        .then(() => maybeLoadScript(\"vega-lite\", \"5.20.1\"))\n",
-       "        .then(() => maybeLoadScript(\"vega-embed\", \"6\"))\n",
-       "        .catch(showError)\n",
-       "        .then(() => displayChart(vegaEmbed));\n",
-       "    }\n",
-       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"layer\": [{\"mark\": \"rect\", \"encoding\": {\"color\": {\"field\": \"completeness\", \"legend\": null, \"scale\": {\"scheme\": \"darkred\", \"zero\": true}, \"type\": \"quantitative\"}, \"tooltip\": [{\"field\": \"source_dataset\", \"title\": \"Source dataset\", \"type\": \"nominal\"}, {\"field\": \"total_rows_inc_nulls\", \"format\": \",\", \"title\": \"# of records\", \"type\": \"quantitative\"}, {\"field\": \"column_name\", \"title\": \"Column name\", \"type\": \"nominal\"}, {\"field\": \"total_null_rows\", \"format\": \",\", \"title\": \"# of nulls\", \"type\": \"quantitative\"}, {\"field\": \"completeness\", \"format\": \".1%\", \"type\": \"quantitative\"}], \"x\": {\"axis\": {\"labelAngle\": 20}, \"field\": \"column_name\", \"sort\": {\"field\": \"mean_comp\", \"order\": \"descending\"}, \"title\": \"Column name\", \"type\": \"nominal\"}, \"y\": {\"field\": \"source_dataset\", \"title\": \"Source dataset\", \"type\": \"nominal\"}}, \"title\": \"Column completeness by source dataset\", \"transform\": [{\"joinaggregate\": [{\"op\": \"mean\", \"field\": \"completeness\", \"as\": \"mean_comp\"}], \"groupby\": [\"column_name\"]}]}, {\"mark\": {\"type\": \"text\"}, \"encoding\": {\"color\": {\"condition\": {\"test\": \"datum['completeness'] < 0.5\", \"value\": \"white\"}, \"value\": \"black\"}, \"text\": {\"field\": \"completeness\", \"format\": \".0%\", \"type\": \"quantitative\"}, \"x\": {\"axis\": {\"labelAngle\": 0}, \"field\": \"column_name\", \"sort\": {\"field\": \"mean_comp\", \"order\": \"descending\"}, \"type\": \"nominal\"}, \"y\": {\"field\": \"source_dataset\", \"type\": \"nominal\"}}, \"transform\": [{\"joinaggregate\": [{\"op\": \"mean\", \"field\": \"completeness\", \"as\": \"mean_comp\"}], \"groupby\": [\"column_name\"]}]}], \"data\": {\"name\": \"data-08289b57cb7a9ca1ff1da3e2ddccde42\"}, \"height\": {\"step\": 40}, \"width\": {\"step\": 40}, 
\"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-08289b57cb7a9ca1ff1da3e2ddccde42\": [{\"source_dataset\": \"input_data_1\", \"column_name\": \"report_year\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 192164, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name\", \"total_null_rows\": 1, \"total_rows_inc_nulls\": 192164, \"completeness\": 0.9999948143959045}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"loc_of_incorporation\", \"total_null_rows\": 18779, \"total_rows_inc_nulls\": 192164, \"completeness\": 0.9022761583328247}]}}, {\"mode\": \"vega-lite\"});\n",
-       "</script>"
-      ],
-      "text/plain": [
-       "alt.LayerChart(...)"
-      ]
-     },
-     "execution_count": 93,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# sometimes this will show up as 100% non null in loc_of_incorporation, not sure why\n",
-    "completeness_chart(ex21_match_df[match_cols], db_api=db_api)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 94,
-   "id": "520a9b86",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "\n",
-       "<style>\n",
-       "  #altair-viz-0fe59c00c8af4561818cd26f7b170021.vega-embed {\n",
+       "  #altair-viz-9682c8e8aeed4edb9bbb7570564bf923.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-0fe59c00c8af4561818cd26f7b170021.vega-embed details,\n",
-       "  #altair-viz-0fe59c00c8af4561818cd26f7b170021.vega-embed details summary {\n",
+       "  #altair-viz-9682c8e8aeed4edb9bbb7570564bf923.vega-embed details,\n",
+       "  #altair-viz-9682c8e8aeed4edb9bbb7570564bf923.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-0fe59c00c8af4561818cd26f7b170021\"></div>\n",
+       "<div id=\"altair-viz-9682c8e8aeed4edb9bbb7570564bf923\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-0fe59c00c8af4561818cd26f7b170021\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-0fe59c00c8af4561818cd26f7b170021\");\n",
+       "    if (outputDiv.id !== \"altair-viz-9682c8e8aeed4edb9bbb7570564bf923\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-9682c8e8aeed4edb9bbb7570564bf923\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -1449,20 +1335,21 @@
        "        .catch(showError)\n",
        "        .then(() => displayChart(vegaEmbed));\n",
        "    }\n",
-       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"layer\": [{\"mark\": \"rect\", \"encoding\": {\"color\": {\"field\": \"completeness\", \"legend\": null, \"scale\": {\"scheme\": \"darkred\", \"zero\": true}, \"type\": \"quantitative\"}, \"tooltip\": [{\"field\": \"source_dataset\", \"title\": \"Source dataset\", \"type\": \"nominal\"}, {\"field\": \"total_rows_inc_nulls\", \"format\": \",\", \"title\": \"# of records\", \"type\": \"quantitative\"}, {\"field\": \"column_name\", \"title\": \"Column name\", \"type\": \"nominal\"}, {\"field\": \"total_null_rows\", \"format\": \",\", \"title\": \"# of nulls\", \"type\": \"quantitative\"}, {\"field\": \"completeness\", \"format\": \".1%\", \"type\": \"quantitative\"}], \"x\": {\"axis\": {\"labelAngle\": 20}, \"field\": \"column_name\", \"sort\": {\"field\": \"mean_comp\", \"order\": \"descending\"}, \"title\": \"Column name\", \"type\": \"nominal\"}, \"y\": {\"field\": \"source_dataset\", \"title\": \"Source dataset\", \"type\": \"nominal\"}}, \"title\": \"Column completeness by source dataset\", \"transform\": [{\"joinaggregate\": [{\"op\": \"mean\", \"field\": \"completeness\", \"as\": \"mean_comp\"}], \"groupby\": [\"column_name\"]}]}, {\"mark\": {\"type\": \"text\"}, \"encoding\": {\"color\": {\"condition\": {\"test\": \"datum['completeness'] < 0.5\", \"value\": \"white\"}, \"value\": \"black\"}, \"text\": {\"field\": \"completeness\", \"format\": \".0%\", \"type\": \"quantitative\"}, \"x\": {\"axis\": {\"labelAngle\": 0}, \"field\": \"column_name\", \"sort\": {\"field\": \"mean_comp\", \"order\": \"descending\"}, \"type\": \"nominal\"}, \"y\": {\"field\": \"source_dataset\", \"type\": \"nominal\"}}, \"transform\": [{\"joinaggregate\": [{\"op\": \"mean\", \"field\": \"completeness\", \"as\": \"mean_comp\"}], \"groupby\": [\"column_name\"]}]}], \"data\": {\"name\": \"data-0ad83db79741c92ff59a5e8e4b65695b\"}, \"height\": {\"step\": 40}, \"width\": {\"step\": 40}, 
\"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-0ad83db79741c92ff59a5e8e4b65695b\": [{\"source_dataset\": \"input_data_1\", \"column_name\": \"report_year\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 7107, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 7107, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"loc_of_incorporation\", \"total_null_rows\": 748, \"total_rows_inc_nulls\": 7107, \"completeness\": 0.8947516679763794}]}}, {\"mode\": \"vega-lite\"});\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"layer\": [{\"mark\": \"rect\", \"encoding\": {\"color\": {\"field\": \"completeness\", \"legend\": null, \"scale\": {\"scheme\": \"darkred\", \"zero\": true}, \"type\": \"quantitative\"}, \"tooltip\": [{\"field\": \"source_dataset\", \"title\": \"Source dataset\", \"type\": \"nominal\"}, {\"field\": \"total_rows_inc_nulls\", \"format\": \",\", \"title\": \"# of records\", \"type\": \"quantitative\"}, {\"field\": \"column_name\", \"title\": \"Column name\", \"type\": \"nominal\"}, {\"field\": \"total_null_rows\", \"format\": \",\", \"title\": \"# of nulls\", \"type\": \"quantitative\"}, {\"field\": \"completeness\", \"format\": \".1%\", \"type\": \"quantitative\"}], \"x\": {\"axis\": {\"labelAngle\": 20}, \"field\": \"column_name\", \"sort\": {\"field\": \"mean_comp\", \"order\": \"descending\"}, \"title\": \"Column name\", \"type\": \"nominal\"}, \"y\": {\"field\": \"source_dataset\", \"title\": \"Source dataset\", \"type\": \"nominal\"}}, \"title\": \"Column completeness by source dataset\", \"transform\": [{\"joinaggregate\": [{\"op\": \"mean\", \"field\": \"completeness\", \"as\": \"mean_comp\"}], \"groupby\": [\"column_name\"]}]}, {\"mark\": {\"type\": \"text\"}, \"encoding\": {\"color\": {\"condition\": {\"test\": \"datum['completeness'] < 0.5\", \"value\": \"white\"}, \"value\": \"black\"}, \"text\": {\"field\": \"completeness\", \"format\": \".0%\", \"type\": \"quantitative\"}, \"x\": {\"axis\": {\"labelAngle\": 0}, \"field\": \"column_name\", \"sort\": {\"field\": \"mean_comp\", \"order\": \"descending\"}, \"type\": \"nominal\"}, \"y\": {\"field\": \"source_dataset\", \"type\": \"nominal\"}}, \"transform\": [{\"joinaggregate\": [{\"op\": \"mean\", \"field\": \"completeness\", \"as\": \"mean_comp\"}], \"groupby\": [\"column_name\"]}]}], \"data\": {\"name\": \"data-81699f80f14acae99268411e873cd701\"}, \"height\": {\"step\": 40}, \"width\": {\"step\": 40}, 
\"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-81699f80f14acae99268411e873cd701\": [{\"source_dataset\": \"input_data_1\", \"column_name\": \"report_year\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 191939, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_2\", \"column_name\": \"report_year\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 7131, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name\", \"total_null_rows\": 1, \"total_rows_inc_nulls\": 191939, \"completeness\": 0.9999948143959045}, {\"source_dataset\": \"input_data_2\", \"column_name\": \"company_name\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 7131, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"loc_of_incorporation\", \"total_null_rows\": 18757, \"total_rows_inc_nulls\": 191939, \"completeness\": 0.9022762179374695}, {\"source_dataset\": \"input_data_2\", \"column_name\": \"loc_of_incorporation\", \"total_null_rows\": 749, \"total_rows_inc_nulls\": 7131, \"completeness\": 0.894965648651123}]}}, {\"mode\": \"vega-lite\"});\n",
        "</script>"
       ],
       "text/plain": [
        "alt.LayerChart(...)"
       ]
      },
-     "execution_count": 94,
+     "execution_count": 77,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "completeness_chart(sec_match_df[match_cols], db_api=db_api)"
+    "# sometimes this will show up as 100% complete in loc_of_incorporation, not sure why\n",
+    "completeness_chart([ex21_match_df[match_cols], sec_match_df[match_cols]], db_api=db_api)"
    ]
   },
   {
@@ -1475,7 +1362,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 95,
+   "execution_count": 78,
    "id": "a5c26016-2c59-4335-bd39-8b2e7ea91840",
    "metadata": {
     "tags": []
@@ -1486,23 +1373,23 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-fe7058e71c9d4ecda1e3ac05015e93b6.vega-embed {\n",
+       "  #altair-viz-7f5b8ba744e84724a4c2cfa748390a95.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-fe7058e71c9d4ecda1e3ac05015e93b6.vega-embed details,\n",
-       "  #altair-viz-fe7058e71c9d4ecda1e3ac05015e93b6.vega-embed details summary {\n",
+       "  #altair-viz-7f5b8ba744e84724a4c2cfa748390a95.vega-embed details,\n",
+       "  #altair-viz-7f5b8ba744e84724a4c2cfa748390a95.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-fe7058e71c9d4ecda1e3ac05015e93b6\"></div>\n",
+       "<div id=\"altair-viz-7f5b8ba744e84724a4c2cfa748390a95\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-fe7058e71c9d4ecda1e3ac05015e93b6\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-fe7058e71c9d4ecda1e3ac05015e93b6\");\n",
+       "    if (outputDiv.id !== \"altair-viz-7f5b8ba744e84724a4c2cfa748390a95\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-7f5b8ba744e84724a4c2cfa748390a95\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -1548,14 +1435,14 @@
        "        .catch(showError)\n",
        "        .then(() => displayChart(vegaEmbed));\n",
        "    }\n",
-       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"vconcat\": [{\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 192164, \"group_name\": \"_report_year_\", \"total_non_null_rows\": 192164, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 192164.0, \"distinct_value_count\": 1}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 192164, \"group_name\": \"_report_year_\", \"total_non_null_rows\": 192164, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 192164.0, \"distinct_value_count\": 1}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"report_year\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 1 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 192164, \"group_name\": \"_report_year_\", \"value\": \"2023\", \"total_non_null_rows\": 192164, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 1}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], 
\"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 1 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 192164, \"group_name\": \"_report_year_\", \"value\": \"2023\", \"total_non_null_rows\": 192164, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 1}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 192164]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 1 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9993755221366882, \"percentile_inc_nulls\": 0.9993755221366882, \"value_count\": 120, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9989644289016724, \"percentile_inc_nulls\": 0.9989644289016724, \"value_count\": 79, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 79.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9986677765846252, \"percentile_inc_nulls\": 0.9986677765846252, \"value_count\": 57, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 57.0, \"distinct_value_count\": 188768}, 
{\"percentile_ex_nulls\": 0.998397171497345, \"percentile_inc_nulls\": 0.9983972311019897, \"value_count\": 52, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 52.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9982254505157471, \"percentile_inc_nulls\": 0.9982254505157471, \"value_count\": 33, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9980589151382446, \"percentile_inc_nulls\": 0.9980589747428894, \"value_count\": 32, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 32.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.997918426990509, \"percentile_inc_nulls\": 0.997918426990509, \"value_count\": 27, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 27.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9976478219032288, \"percentile_inc_nulls\": 0.9976478219032288, \"value_count\": 26, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 52.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9975177049636841, \"percentile_inc_nulls\": 0.9975177645683289, \"value_count\": 25, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 25.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9974084496498108, \"percentile_inc_nulls\": 0.9974084496498108, \"value_count\": 21, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, 
\"sum_tokens_in_value_count_group\": 21.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9972003102302551, \"percentile_inc_nulls\": 0.9972003102302551, \"value_count\": 20, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 40.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9971066117286682, \"percentile_inc_nulls\": 0.9971066117286682, \"value_count\": 18, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 18.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9969505071640015, \"percentile_inc_nulls\": 0.9969505071640015, \"value_count\": 15, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 30.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9968776702880859, \"percentile_inc_nulls\": 0.9968776702880859, \"value_count\": 14, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 14.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.996705949306488, \"percentile_inc_nulls\": 0.996705949306488, \"value_count\": 11, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9963936805725098, \"percentile_inc_nulls\": 0.9963936805725098, \"value_count\": 10, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 60.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9961127042770386, \"percentile_inc_nulls\": 0.9961127042770386, \"value_count\": 9, \"group_name\": \"_company_name_\", 
\"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 54.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9957379698753357, \"percentile_inc_nulls\": 0.9957380294799805, \"value_count\": 8, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 72.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9952279925346375, \"percentile_inc_nulls\": 0.9952280521392822, \"value_count\": 7, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9941352009773254, \"percentile_inc_nulls\": 0.9941352009773254, \"value_count\": 6, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 210.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9931464195251465, \"percentile_inc_nulls\": 0.9931464791297913, \"value_count\": 5, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 190.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9913563132286072, \"percentile_inc_nulls\": 0.9913563132286072, \"value_count\": 4, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 344.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9881871342658997, \"percentile_inc_nulls\": 0.9881871938705444, \"value_count\": 3, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 609.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.9721382260322571, \"percentile_inc_nulls\": 0.9721384048461914, 
\"value_count\": 2, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 3084.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 5.185604095458984e-06, \"value_count\": 1, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 186809.0, \"distinct_value_count\": 188768}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 120, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 188768}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"company_name\\\"\", \"subtitle\": \"In this col, 1 values (0.0%) are null and there are 188768 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 120, \"group_name\": \"_company_name_\", \"value\": \"rush truck center\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 188768}, {\"value_count\": 79, \"group_name\": \"_company_name_\", \"value\": \"encompass health rehabilitation hospital\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 188768}, {\"value_count\": 57, 
\"group_name\": \"_company_name_\", \"value\": \"rush peterbilt truck center\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 188768}, {\"value_count\": 52, \"group_name\": \"_company_name_\", \"value\": \"branch\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 188768}, {\"value_count\": 33, \"group_name\": \"_company_name_\", \"value\": \"sci funeral services, llc iowa limited liability company\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 188768}, {\"value_count\": 32, \"group_name\": \"_company_name_\", \"value\": \"partnership limited partnership\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 188768}, {\"value_count\": 27, \"group_name\": \"_company_name_\", \"value\": \"alderwoods group, llc de limited liability company\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 188768}, {\"value_count\": 26, \"group_name\": \"_company_name_\", \"value\": \"u haul co. 
of\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 188768}, {\"value_count\": 26, \"group_name\": \"_company_name_\", \"value\": \"encompass health rehabilitation hospital of\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 188768}, {\"value_count\": 25, \"group_name\": \"_company_name_\", \"value\": \"at&t\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 188768}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"pepsico global mobility, limited liability company\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 188768}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"pepsico global real estate, incorporated\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 188768}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"pepsico group finance international b.v\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 188768}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"pepsico logistyka sp. 
z o.o\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 188768}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"pepsico y limited liability company\", \"total_non_null_rows\": 192163, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 188768}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 120]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.032447993755340576, \"percentile_inc_nulls\": 0.1270008683204651, \"value_count\": 10, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 220.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.03099459409713745, \"percentile_inc_nulls\": 0.12568950653076172, \"value_count\": 9, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 252.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.02970266342163086, \"percentile_inc_nulls\": 0.1245238184928894, \"value_count\": 8, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 224.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.02812814712524414, \"percentile_inc_nulls\": 0.12310320138931274, \"value_count\": 7, 
\"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 273.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.026121079921722412, \"percentile_inc_nulls\": 0.12129223346710205, \"value_count\": 6, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 348.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.02349686622619629, \"percentile_inc_nulls\": 0.11892443895339966, \"value_count\": 5, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 455.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.020636141300201416, \"percentile_inc_nulls\": 0.11634331941604614, \"value_count\": 4, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 496.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.01705455780029297, \"percentile_inc_nulls\": 0.11311173439025879, \"value_count\": 3, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 621.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.01166766881942749, \"percentile_inc_nulls\": 0.10825127363204956, \"value_count\": 2, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 934.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.09772384166717529, \"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 2023.0, \"distinct_value_count\": 3458}, 
{\"percentile_ex_nulls\": 0.6351702809333801, \"percentile_inc_nulls\": 0.6708228588104248, \"value_count\": 63256, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 63256.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.6025030612945557, \"percentile_inc_nulls\": 0.6413480043411255, \"value_count\": 5664, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 5664.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.5844392776489258, \"percentile_inc_nulls\": 0.6250494718551636, \"value_count\": 3132, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 3132.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.5671367049217224, \"percentile_inc_nulls\": 0.6094377636909485, \"value_count\": 3000, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 3000.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.5508146286010742, \"percentile_inc_nulls\": 0.5947107672691345, \"value_count\": 2830, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 2830.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.5346598625183105, \"percentile_inc_nulls\": 0.5801346898078918, \"value_count\": 2801, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 2801.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.5185166001319885, \"percentile_inc_nulls\": 0.5655689835548401, \"value_count\": 2799, \"group_name\": \"_loc_of_incorporation_\", 
\"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 2799.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.5036594867706299, \"percentile_inc_nulls\": 0.5521637797355652, \"value_count\": 2576, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 2576.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.489102303981781, \"percentile_inc_nulls\": 0.5390291213989258, \"value_count\": 2524, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 2524.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.4755774736404419, \"percentile_inc_nulls\": 0.526826024055481, \"value_count\": 2345, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 2345.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.46293509006500244, \"percentile_inc_nulls\": 0.5154191255569458, \"value_count\": 2192, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 2192.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.450356125831604, \"percentile_inc_nulls\": 0.504069447517395, \"value_count\": 2181, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 2181.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.43872886896133423, \"percentile_inc_nulls\": 0.49357837438583374, \"value_count\": 2016, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 2016.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 
0.4272860884666443, \"percentile_inc_nulls\": 0.4832538962364197, \"value_count\": 1984, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1984.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.4172448515892029, \"percentile_inc_nulls\": 0.4741939306259155, \"value_count\": 1741, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1741.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.4076130986213684, \"percentile_inc_nulls\": 0.46550339460372925, \"value_count\": 1670, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1670.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.3980448246002197, \"percentile_inc_nulls\": 0.4568701982498169, \"value_count\": 1659, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1659.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.38943392038345337, \"percentile_inc_nulls\": 0.4491007924079895, \"value_count\": 1493, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1493.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.38099604845046997, \"percentile_inc_nulls\": 0.44148749113082886, \"value_count\": 1463, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1463.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.372806191444397, \"percentile_inc_nulls\": 0.43409794569015503, \"value_count\": 1420, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, 
\"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1420.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.36476051807403564, \"percentile_inc_nulls\": 0.4268385171890259, \"value_count\": 1395, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1395.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.35703206062316895, \"percentile_inc_nulls\": 0.41986531019210815, \"value_count\": 1340, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1340.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.3493266701698303, \"percentile_inc_nulls\": 0.41291290521621704, \"value_count\": 1336, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1336.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.3420711159706116, \"percentile_inc_nulls\": 0.40636640787124634, \"value_count\": 1258, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1258.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.33516740798950195, \"percentile_inc_nulls\": 0.40013736486434937, \"value_count\": 1197, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1197.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.32851743698120117, \"percentile_inc_nulls\": 0.39413732290267944, \"value_count\": 1153, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1153.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.3222770094871521, 
\"percentile_inc_nulls\": 0.3885067105293274, \"value_count\": 1082, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1082.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.31607115268707275, \"percentile_inc_nulls\": 0.38290733098983765, \"value_count\": 1076, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1076.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.30991148948669434, \"percentile_inc_nulls\": 0.3773495554924011, \"value_count\": 1068, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1068.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.30378061532974243, \"percentile_inc_nulls\": 0.37181782722473145, \"value_count\": 1063, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1063.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.2978919744491577, \"percentile_inc_nulls\": 0.3665046691894531, \"value_count\": 1021, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1021.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.2920668125152588, \"percentile_inc_nulls\": 0.3612487316131592, \"value_count\": 1010, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 1010.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.2864896059036255, \"percentile_inc_nulls\": 0.35621654987335205, \"value_count\": 967, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, 
\"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 967.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.2809297442436218, \"percentile_inc_nulls\": 0.35120004415512085, \"value_count\": 964, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 964.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.2754044532775879, \"percentile_inc_nulls\": 0.3462147116661072, \"value_count\": 958, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 958.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.27058857679367065, \"percentile_inc_nulls\": 0.3418694734573364, \"value_count\": 835, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 835.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.2658880352973938, \"percentile_inc_nulls\": 0.3376283049583435, \"value_count\": 815, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 815.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.26138365268707275, \"percentile_inc_nulls\": 0.33356404304504395, \"value_count\": 781, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 781.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.25697147846221924, \"percentile_inc_nulls\": 0.3295830488204956, \"value_count\": 765, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 765.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.252588152885437, \"percentile_inc_nulls\": 
0.3256281018257141, \"value_count\": 760, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 760.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.248210608959198, \"percentile_inc_nulls\": 0.3216783404350281, \"value_count\": 759, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 759.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.24416184425354004, \"percentile_inc_nulls\": 0.31802523136138916, \"value_count\": 702, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 702.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.24016493558883667, \"percentile_inc_nulls\": 0.3144189119338989, \"value_count\": 693, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 693.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.2362315058708191, \"percentile_inc_nulls\": 0.31086987257003784, \"value_count\": 682, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 682.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.23252874612808228, \"percentile_inc_nulls\": 0.3075289726257324, \"value_count\": 642, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 642.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.22886639833450317, \"percentile_inc_nulls\": 0.30422449111938477, \"value_count\": 635, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, 
\"sum_tokens_in_value_count_group\": 635.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.22521555423736572, \"percentile_inc_nulls\": 0.3009304404258728, \"value_count\": 633, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 633.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.2216973900794983, \"percentile_inc_nulls\": 0.2977560758590698, \"value_count\": 610, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 610.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.21819651126861572, \"percentile_inc_nulls\": 0.294597327709198, \"value_count\": 607, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 607.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.21485131978988647, \"percentile_inc_nulls\": 0.29157906770706177, \"value_count\": 580, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 580.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.21168500185012817, \"percentile_inc_nulls\": 0.2887221574783325, \"value_count\": 549, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 549.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.20855897665023804, \"percentile_inc_nulls\": 0.28590160608291626, \"value_count\": 542, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 542.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.20568102598190308, \"percentile_inc_nulls\": 0.2833048701286316, 
\"value_count\": 499, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 499.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.20283758640289307, \"percentile_inc_nulls\": 0.28073936700820923, \"value_count\": 493, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 493.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.20003461837768555, \"percentile_inc_nulls\": 0.27821028232574463, \"value_count\": 486, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 486.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.19467079639434814, \"percentile_inc_nulls\": 0.2733706831932068, \"value_count\": 465, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 930.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.19212734699249268, \"percentile_inc_nulls\": 0.2710757255554199, \"value_count\": 441, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 441.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.1896357536315918, \"percentile_inc_nulls\": 0.2688276767730713, \"value_count\": 432, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 432.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.18717306852340698, \"percentile_inc_nulls\": 0.2666056156158447, \"value_count\": 427, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 427.0, 
\"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.18473917245864868, \"percentile_inc_nulls\": 0.26440954208374023, \"value_count\": 422, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 422.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.18231678009033203, \"percentile_inc_nulls\": 0.2622239589691162, \"value_count\": 420, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 420.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.17991751432418823, \"percentile_inc_nulls\": 0.260059118270874, \"value_count\": 416, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 416.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.17752975225448608, \"percentile_inc_nulls\": 0.25790470838546753, \"value_count\": 414, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 414.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.1751651167869568, \"percentile_inc_nulls\": 0.25577110052108765, \"value_count\": 410, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 410.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.17281192541122437, \"percentile_inc_nulls\": 0.25364792346954346, \"value_count\": 408, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 408.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.16838252544403076, \"percentile_inc_nulls\": 0.24965131282806396, \"value_count\": 384, \"group_name\": 
\"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 768.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.16626006364822388, \"percentile_inc_nulls\": 0.24773633480072021, \"value_count\": 368, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 368.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.1642356514930725, \"percentile_inc_nulls\": 0.24590975046157837, \"value_count\": 351, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 351.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.16222858428955078, \"percentile_inc_nulls\": 0.24409878253936768, \"value_count\": 348, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 348.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.15823745727539062, \"percentile_inc_nulls\": 0.24049770832061768, \"value_count\": 346, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 692.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.15429246425628662, \"percentile_inc_nulls\": 0.2369382381439209, \"value_count\": 342, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 684.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.15233731269836426, \"percentile_inc_nulls\": 0.23517411947250366, \"value_count\": 339, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 339.0, \"distinct_value_count\": 3458}, 
{\"percentile_ex_nulls\": 0.15039938688278198, \"percentile_inc_nulls\": 0.23342561721801758, \"value_count\": 336, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 336.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.14853650331497192, \"percentile_inc_nulls\": 0.23174476623535156, \"value_count\": 323, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 323.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.14669668674468994, \"percentile_inc_nulls\": 0.23008471727371216, \"value_count\": 319, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 319.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.1448625922203064, \"percentile_inc_nulls\": 0.2284298539161682, \"value_count\": 318, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 318.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.14124059677124023, \"percentile_inc_nulls\": 0.2251618504524231, \"value_count\": 314, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 628.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.13776856660842896, \"percentile_inc_nulls\": 0.22202908992767334, \"value_count\": 301, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 602.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.13604408502578735, \"percentile_inc_nulls\": 0.22047311067581177, \"value_count\": 299, \"group_name\": \"_loc_of_incorporation_\", 
\"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 299.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.1343311071395874, \"percentile_inc_nulls\": 0.2189275622367859, \"value_count\": 297, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 297.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.1309167742729187, \"percentile_inc_nulls\": 0.21584689617156982, \"value_count\": 296, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 592.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.12926721572875977, \"percentile_inc_nulls\": 0.21435856819152832, \"value_count\": 286, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 286.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.12764078378677368, \"percentile_inc_nulls\": 0.2128911018371582, \"value_count\": 282, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 282.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.12603163719177246, \"percentile_inc_nulls\": 0.21143919229507446, \"value_count\": 279, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 279.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.1245032548904419, \"percentile_inc_nulls\": 0.21006017923355103, \"value_count\": 265, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 265.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 
0.12307292222976685, \"percentile_inc_nulls\": 0.20876961946487427, \"value_count\": 248, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 248.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.12023532390594482, \"percentile_inc_nulls\": 0.20620930194854736, \"value_count\": 246, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 492.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.11886841058731079, \"percentile_inc_nulls\": 0.20497596263885498, \"value_count\": 237, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 237.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.11756497621536255, \"percentile_inc_nulls\": 0.20379990339279175, \"value_count\": 226, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 226.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.11626726388931274, \"percentile_inc_nulls\": 0.20262902975082397, \"value_count\": 225, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 225.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.11371803283691406, \"percentile_inc_nulls\": 0.20032888650894165, \"value_count\": 221, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 442.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.11124950647354126, \"percentile_inc_nulls\": 0.19810163974761963, \"value_count\": 214, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, 
\"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 428.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.10880410671234131, \"percentile_inc_nulls\": 0.19589519500732422, \"value_count\": 212, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 424.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.10758715867996216, \"percentile_inc_nulls\": 0.19479715824127197, \"value_count\": 211, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 211.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.10640484094619751, \"percentile_inc_nulls\": 0.19373035430908203, \"value_count\": 205, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 205.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.10524553060531616, \"percentile_inc_nulls\": 0.1926843523979187, \"value_count\": 201, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 201.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.1040920615196228, \"percentile_inc_nulls\": 0.1916435956954956, \"value_count\": 200, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.10294431447982788, \"percentile_inc_nulls\": 0.19060802459716797, \"value_count\": 199, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 199.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.10180234909057617, 
\"percentile_inc_nulls\": 0.1895776391029358, \"value_count\": 198, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 198.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.10066616535186768, \"percentile_inc_nulls\": 0.18855249881744385, \"value_count\": 197, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 197.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.09953570365905762, \"percentile_inc_nulls\": 0.18753254413604736, \"value_count\": 196, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 196.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.09730947017669678, \"percentile_inc_nulls\": 0.18552380800247192, \"value_count\": 193, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 386.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.09518700838088989, \"percentile_inc_nulls\": 0.1836087703704834, \"value_count\": 184, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 368.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.09414887428283691, \"percentile_inc_nulls\": 0.18267208337783813, \"value_count\": 180, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 180.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.09315109252929688, \"percentile_inc_nulls\": 0.18177181482315063, \"value_count\": 173, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 
192164, \"sum_tokens_in_value_count_group\": 173.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.09216481447219849, \"percentile_inc_nulls\": 0.18088197708129883, \"value_count\": 171, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 171.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.09119009971618652, \"percentile_inc_nulls\": 0.18000251054763794, \"value_count\": 169, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 169.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.09025001525878906, \"percentile_inc_nulls\": 0.17915427684783936, \"value_count\": 163, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 163.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.08933299779891968, \"percentile_inc_nulls\": 0.17832684516906738, \"value_count\": 159, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 159.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.08842170238494873, \"percentile_inc_nulls\": 0.17750465869903564, \"value_count\": 158, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 158.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.08751624822616577, \"percentile_inc_nulls\": 0.1766875982284546, \"value_count\": 157, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 157.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.08663380146026611, \"percentile_inc_nulls\": 
0.17589139938354492, \"value_count\": 153, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 153.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.08580905199050903, \"percentile_inc_nulls\": 0.17514729499816895, \"value_count\": 143, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 143.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.08501309156417847, \"percentile_inc_nulls\": 0.17442911863327026, \"value_count\": 138, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.08423453569412231, \"percentile_inc_nulls\": 0.1737266182899475, \"value_count\": 135, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 135.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.08347898721694946, \"percentile_inc_nulls\": 0.17304491996765137, \"value_count\": 131, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 131.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.08272916078567505, \"percentile_inc_nulls\": 0.17236840724945068, \"value_count\": 130, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 130.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.08198517560958862, \"percentile_inc_nulls\": 0.17169708013534546, \"value_count\": 129, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, 
\"sum_tokens_in_value_count_group\": 129.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.08124691247940063, \"percentile_inc_nulls\": 0.17103099822998047, \"value_count\": 128, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 128.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.08052021265029907, \"percentile_inc_nulls\": 0.1703752875328064, \"value_count\": 126, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 126.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.07908987998962402, \"percentile_inc_nulls\": 0.16908472776412964, \"value_count\": 124, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 248.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.07838046550750732, \"percentile_inc_nulls\": 0.16844463348388672, \"value_count\": 123, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 123.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.07767683267593384, \"percentile_inc_nulls\": 0.16780978441238403, \"value_count\": 122, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 122.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.07697898149490356, \"percentile_inc_nulls\": 0.1671801209449768, \"value_count\": 121, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 121.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.07629263401031494, \"percentile_inc_nulls\": 0.1665608286857605, 
\"value_count\": 119, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 119.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.07563513517379761, \"percentile_inc_nulls\": 0.16596758365631104, \"value_count\": 114, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.07498341798782349, \"percentile_inc_nulls\": 0.1653795838356018, \"value_count\": 113, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 113.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.07434898614883423, \"percentile_inc_nulls\": 0.16480714082717896, \"value_count\": 110, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.07373183965682983, \"percentile_inc_nulls\": 0.16425031423568726, \"value_count\": 107, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 107.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.07253223657608032, \"percentile_inc_nulls\": 0.16316789388656616, \"value_count\": 104, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 208.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.07195550203323364, \"percentile_inc_nulls\": 0.162647545337677, \"value_count\": 100, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 100.0, 
\"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.0713844895362854, \"percentile_inc_nulls\": 0.16213232278823853, \"value_count\": 99, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.07081925868988037, \"percentile_inc_nulls\": 0.16162234544754028, \"value_count\": 98, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.07026559114456177, \"percentile_inc_nulls\": 0.16112279891967773, \"value_count\": 96, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 96.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06971770524978638, \"percentile_inc_nulls\": 0.16062843799591064, \"value_count\": 95, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 95.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06918132305145264, \"percentile_inc_nulls\": 0.16014444828033447, \"value_count\": 93, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 93.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06815469264984131, \"percentile_inc_nulls\": 0.1592181921005249, \"value_count\": 89, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 178.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06764715909957886, \"percentile_inc_nulls\": 0.15876024961471558, \"value_count\": 88, \"group_name\": 
\"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06714534759521484, \"percentile_inc_nulls\": 0.1583074927330017, \"value_count\": 87, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 87.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06666088104248047, \"percentile_inc_nulls\": 0.157870352268219, \"value_count\": 84, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 84.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06618797779083252, \"percentile_inc_nulls\": 0.15744364261627197, \"value_count\": 82, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 82.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06573230028152466, \"percentile_inc_nulls\": 0.1570325493812561, \"value_count\": 79, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 79.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06528246402740479, \"percentile_inc_nulls\": 0.1566266417503357, \"value_count\": 78, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06483834981918335, \"percentile_inc_nulls\": 0.15622591972351074, \"value_count\": 77, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 77.0, \"distinct_value_count\": 3458}, 
{\"percentile_ex_nulls\": 0.0644230842590332, \"percentile_inc_nulls\": 0.15585124492645264, \"value_count\": 72, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 72.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06361567974090576, \"percentile_inc_nulls\": 0.15512269735336304, \"value_count\": 70, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 140.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06284278631210327, \"percentile_inc_nulls\": 0.15442538261413574, \"value_count\": 67, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 134.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06246793270111084, \"percentile_inc_nulls\": 0.1540871262550354, \"value_count\": 65, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 65.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06175273656845093, \"percentile_inc_nulls\": 0.15344184637069702, \"value_count\": 62, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 124.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06140094995498657, \"percentile_inc_nulls\": 0.1531243920326233, \"value_count\": 61, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 61.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06036275625228882, \"percentile_inc_nulls\": 0.15218770503997803, \"value_count\": 60, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 
173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 180.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.06002247333526611, \"percentile_inc_nulls\": 0.15188068151474, \"value_count\": 59, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 59.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.05868440866470337, \"percentile_inc_nulls\": 0.15067338943481445, \"value_count\": 58, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 232.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.05835568904876709, \"percentile_inc_nulls\": 0.15037673711776733, \"value_count\": 57, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 57.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.05803269147872925, \"percentile_inc_nulls\": 0.15008533000946045, \"value_count\": 56, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.05708104372024536, \"percentile_inc_nulls\": 0.14922672510147095, \"value_count\": 55, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 165.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.056769609451293945, \"percentile_inc_nulls\": 0.14894568920135498, \"value_count\": 54, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 54.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.05647546052932739, 
\"percentile_inc_nulls\": 0.14868026971817017, \"value_count\": 51, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 51.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.056192874908447266, \"percentile_inc_nulls\": 0.14842528104782104, \"value_count\": 49, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 49.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.05536234378814697, \"percentile_inc_nulls\": 0.14767593145370483, \"value_count\": 48, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.055091261863708496, \"percentile_inc_nulls\": 0.1474313735961914, \"value_count\": 47, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 47.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.05483752489089966, \"percentile_inc_nulls\": 0.14720237255096436, \"value_count\": 44, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 44.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.05409348011016846, \"percentile_inc_nulls\": 0.1465311050415039, \"value_count\": 43, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 129.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.053366780281066895, \"percentile_inc_nulls\": 0.14587539434432983, \"value_count\": 42, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, 
\"sum_tokens_in_value_count_group\": 126.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.05242091417312622, \"percentile_inc_nulls\": 0.1450219750404358, \"value_count\": 41, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 164.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.05195951461791992, \"percentile_inc_nulls\": 0.1446056365966797, \"value_count\": 40, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 80.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.05150961875915527, \"percentile_inc_nulls\": 0.14419972896575928, \"value_count\": 39, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.05107128620147705, \"percentile_inc_nulls\": 0.14380425214767456, \"value_count\": 38, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 76.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.050431132316589355, \"percentile_inc_nulls\": 0.14322662353515625, \"value_count\": 37, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 111.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.050223469734191895, \"percentile_inc_nulls\": 0.1430392861366272, \"value_count\": 36, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 36.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.04943913221359253, \"percentile_inc_nulls\": 0.1423315405845642, \"value_count\": 
34, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 136.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.049081504344940186, \"percentile_inc_nulls\": 0.14200890064239502, \"value_count\": 31, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 62.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.04873549938201904, \"percentile_inc_nulls\": 0.14169669151306152, \"value_count\": 30, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 60.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.04806643724441528, \"percentile_inc_nulls\": 0.14109301567077637, \"value_count\": 29, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 116.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.04774343967437744, \"percentile_inc_nulls\": 0.14080160856246948, \"value_count\": 28, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.04712057113647461, \"percentile_inc_nulls\": 0.14023959636688232, \"value_count\": 27, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.04667067527770996, \"percentile_inc_nulls\": 0.13983368873596191, \"value_count\": 26, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 
3458}, {\"percentile_ex_nulls\": 0.04566138982772827, \"percentile_inc_nulls\": 0.13892298936843872, \"value_count\": 25, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 175.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.04538452625274658, \"percentile_inc_nulls\": 0.13867324590682983, \"value_count\": 24, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 48.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.04445600509643555, \"percentile_inc_nulls\": 0.13783538341522217, \"value_count\": 23, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 161.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.043567776679992676, \"percentile_inc_nulls\": 0.13703399896621704, \"value_count\": 22, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 154.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.043204426765441895, \"percentile_inc_nulls\": 0.1367061734199524, \"value_count\": 21, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 63.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.04285836219787598, \"percentile_inc_nulls\": 0.13639390468597412, \"value_count\": 20, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 60.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.04187214374542236, \"percentile_inc_nulls\": 0.13550406694412231, \"value_count\": 19, \"group_name\": \"_loc_of_incorporation_\", 
\"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 171.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.04093778133392334, \"percentile_inc_nulls\": 0.1346610188484192, \"value_count\": 18, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 162.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.040153443813323975, \"percentile_inc_nulls\": 0.1339532732963562, \"value_count\": 17, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 136.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.039415180683135986, \"percentile_inc_nulls\": 0.1332871913909912, \"value_count\": 16, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 128.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.03829050064086914, \"percentile_inc_nulls\": 0.13227242231369019, \"value_count\": 15, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 195.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.03716009855270386, \"percentile_inc_nulls\": 0.1312524676322937, \"value_count\": 14, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 196.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.036185383796691895, \"percentile_inc_nulls\": 0.1303730010986328, \"value_count\": 13, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 169.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 
0.03473198413848877, \"percentile_inc_nulls\": 0.12906163930892944, \"value_count\": 12, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 252.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 0.03371685743331909, \"percentile_inc_nulls\": 0.1281457543373108, \"value_count\": 11, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 176.0, \"distinct_value_count\": 3458}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 10, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"sum_tokens_in_value_count_group\": 220.0, \"distinct_value_count\": 3458}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"loc_of_incorporation\\\"\", \"subtitle\": \"In this col, 18,779 values (9.8%) are null and there are 3458 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 63256, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"delaware\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 3458}, {\"value_count\": 5664, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"united kingdom\", \"total_non_null_rows\": 173385, 
\"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 3458}, {\"value_count\": 3132, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"texas\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 3458}, {\"value_count\": 3000, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"netherlands\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 3458}, {\"value_count\": 2830, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"germany\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 3458}, {\"value_count\": 2801, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"cayman islands\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 3458}, {\"value_count\": 2799, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"california\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 3458}, {\"value_count\": 2576, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"china\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 3458}, {\"value_count\": 2524, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"florida\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 3458}, {\"value_count\": 2345, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"australia\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 3458}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": 
\"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"jersey islanddelaware\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 3458}, {\"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"tanzania, united republic of\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 3458}, {\"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"albany\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 3458}, {\"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"calallen\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 3458}, {\"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"private uk\", \"total_non_null_rows\": 173385, \"total_rows_inc_nulls\": 192164, \"distinct_value_count\": 3458}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 63256]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}], \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\"}, {\"mode\": \"vega-lite\"});\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"vconcat\": [{\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 191939, \"group_name\": \"_report_year_\", \"total_non_null_rows\": 191939, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 191939.0, \"distinct_value_count\": 1}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 191939, \"group_name\": \"_report_year_\", \"total_non_null_rows\": 191939, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 191939.0, \"distinct_value_count\": 1}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"report_year\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 1 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 191939, \"group_name\": \"_report_year_\", \"value\": \"2023\", \"total_non_null_rows\": 191939, \"total_rows_inc_nulls\": 191939, \"distinct_value_count\": 1}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], 
\"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 1 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 191939, \"group_name\": \"_report_year_\", \"value\": \"2023\", \"total_non_null_rows\": 191939, \"total_rows_inc_nulls\": 191939, \"distinct_value_count\": 1}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 191939]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 1 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9993748068809509, \"percentile_inc_nulls\": 0.9993748068809509, \"value_count\": 120, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 188367}, {\"percentile_ex_nulls\": 0.9989631772041321, \"percentile_inc_nulls\": 0.9989632368087769, \"value_count\": 79, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 79.0, \"distinct_value_count\": 188367}, {\"percentile_ex_nulls\": 0.9986662268638611, \"percentile_inc_nulls\": 0.9986662268638611, \"value_count\": 57, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 57.0, \"distinct_value_count\": 188367}, 
{\"percentile_ex_nulls\": 0.9983953237533569, \"percentile_inc_nulls\": 0.9983953237533569, \"value_count\": 52, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 52.0, \"distinct_value_count\": 188367}, {\"percentile_ex_nulls\": 0.9982233643531799, \"percentile_inc_nulls\": 0.9982233643531799, \"value_count\": 33, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 188367}, {\"percentile_ex_nulls\": 0.9980566501617432, \"percentile_inc_nulls\": 0.9980566501617432, \"value_count\": 32, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 32.0, \"distinct_value_count\": 188367}, {\"percentile_ex_nulls\": 0.9979159832000732, \"percentile_inc_nulls\": 0.9979159832000732, \"value_count\": 27, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 27.0, \"distinct_value_count\": 188367}, {\"percentile_ex_nulls\": 0.9976450800895691, \"percentile_inc_nulls\": 0.9976450800895691, \"value_count\": 26, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 52.0, \"distinct_value_count\": 188367}, {\"percentile_ex_nulls\": 0.9975148439407349, \"percentile_inc_nulls\": 0.9975148439407349, \"value_count\": 25, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 25.0, \"distinct_value_count\": 188367}, {\"percentile_ex_nulls\": 0.9974054098129272, \"percentile_inc_nulls\": 0.9974054098129272, \"value_count\": 21, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, 
\"sum_tokens_in_value_count_group\": 21.0, \"distinct_value_count\": 188367}, {\"percentile_ex_nulls\": 0.9971970319747925, \"percentile_inc_nulls\": 0.9971970319747925, \"value_count\": 20, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 40.0, \"distinct_value_count\": 188367}, {\"percentile_ex_nulls\": 0.997103214263916, \"percentile_inc_nulls\": 0.9971032738685608, \"value_count\": 18, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 18.0, \"distinct_value_count\": 188367}, {\"percentile_ex_nulls\": 0.9969469308853149, \"percentile_inc_nulls\": 0.9969469308853149, \"value_count\": 15, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 30.0, \"distinct_value_count\": 188367}, {\"percentile_ex_nulls\": 0.9968739748001099, \"percentile_inc_nulls\": 0.9968740344047546, \"value_count\": 14, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 14.0, \"distinct_value_count\": 188367}, {\"percentile_ex_nulls\": 0.9967020750045776, \"percentile_inc_nulls\": 0.9967020750045776, \"value_count\": 11, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 188367}, {\"percentile_ex_nulls\": 0.9963894486427307, \"percentile_inc_nulls\": 0.9963894486427307, \"value_count\": 10, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 60.0, \"distinct_value_count\": 188367}, {\"percentile_ex_nulls\": 0.9960612058639526, \"percentile_inc_nulls\": 0.9960612654685974, \"value_count\": 9, \"group_name\": \"_company_name_\", 
\"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 63.0, \"distinct_value_count\": 188367}, {\"percentile_ex_nulls\": 0.9957694411277771, \"percentile_inc_nulls\": 0.9957695007324219, \"value_count\": 8, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 188367}, {\"percentile_ex_nulls\": 0.9950765371322632, \"percentile_inc_nulls\": 0.9950765371322632, \"value_count\": 7, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 133.0, \"distinct_value_count\": 188367}, {\"percentile_ex_nulls\": 0.9940761923789978, \"percentile_inc_nulls\": 0.9940762519836426, \"value_count\": 6, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 192.0, \"distinct_value_count\": 188367}, {\"percentile_ex_nulls\": 0.992929995059967, \"percentile_inc_nulls\": 0.9929300546646118, \"value_count\": 5, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 220.0, \"distinct_value_count\": 188367}, {\"percentile_ex_nulls\": 0.9911377429962158, \"percentile_inc_nulls\": 0.9911378026008606, \"value_count\": 4, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 344.0, \"distinct_value_count\": 188367}, {\"percentile_ex_nulls\": 0.9877616763114929, \"percentile_inc_nulls\": 0.9877617359161377, \"value_count\": 3, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 648.0, \"distinct_value_count\": 188367}, {\"percentile_ex_nulls\": 0.9704748392105103, \"percentile_inc_nulls\": 0.9704750180244446, 
\"value_count\": 2, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 3318.0, \"distinct_value_count\": 188367}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 5.185604095458984e-06, \"value_count\": 1, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 186271.0, \"distinct_value_count\": 188367}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 120, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 188367}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"company_name\\\"\", \"subtitle\": \"In this col, 1 values (0.0%) are null and there are 188367 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 120, \"group_name\": \"_company_name_\", \"value\": \"rush truck center\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"distinct_value_count\": 188367}, {\"value_count\": 79, \"group_name\": \"_company_name_\", \"value\": \"encompass health rehabilitation hospital\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"distinct_value_count\": 188367}, {\"value_count\": 57, 
\"group_name\": \"_company_name_\", \"value\": \"rush peterbilt truck center\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"distinct_value_count\": 188367}, {\"value_count\": 52, \"group_name\": \"_company_name_\", \"value\": \"branch\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"distinct_value_count\": 188367}, {\"value_count\": 33, \"group_name\": \"_company_name_\", \"value\": \"sci funeral services llc iowa limited liability company\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"distinct_value_count\": 188367}, {\"value_count\": 32, \"group_name\": \"_company_name_\", \"value\": \"partnership limited partnership\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"distinct_value_count\": 188367}, {\"value_count\": 27, \"group_name\": \"_company_name_\", \"value\": \"alderwoods group llc de limited liability company\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"distinct_value_count\": 188367}, {\"value_count\": 26, \"group_name\": \"_company_name_\", \"value\": \"encompass health rehabilitation hospital of\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"distinct_value_count\": 188367}, {\"value_count\": 26, \"group_name\": \"_company_name_\", \"value\": \"u haul co of\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"distinct_value_count\": 188367}, {\"value_count\": 25, \"group_name\": \"_company_name_\", \"value\": \"at and t\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"distinct_value_count\": 188367}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": 
{\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"ingescorp 2008 s l\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"distinct_value_count\": 188367}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"initiator qrs 14 62 incorporated\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"distinct_value_count\": 188367}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"jamesinvest srl\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"distinct_value_count\": 188367}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"jandoor limited liability company\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"distinct_value_count\": 188367}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"jensen beach storage 18 limited liability company\", \"total_non_null_rows\": 191938, \"total_rows_inc_nulls\": 191939, \"distinct_value_count\": 188367}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 120]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.09082931280136108, \"percentile_inc_nulls\": 0.17967689037322998, \"value_count\": 169, \"group_name\": 
\"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 169.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.08988809585571289, \"percentile_inc_nulls\": 0.17882764339447021, \"value_count\": 163, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 163.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.08897000551223755, \"percentile_inc_nulls\": 0.17799925804138184, \"value_count\": 159, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 159.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.08805763721466064, \"percentile_inc_nulls\": 0.17717605829238892, \"value_count\": 158, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 158.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.08716261386871338, \"percentile_inc_nulls\": 0.17636853456497192, \"value_count\": 155, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 155.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.08627915382385254, \"percentile_inc_nulls\": 0.17557138204574585, \"value_count\": 153, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 153.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.08545345067977905, \"percentile_inc_nulls\": 0.17482638359069824, \"value_count\": 143, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 143.0, \"distinct_value_count\": 3414}, 
{\"percentile_ex_nulls\": 0.08464503288269043, \"percentile_inc_nulls\": 0.1740970015525818, \"value_count\": 140, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 140.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.08386552333831787, \"percentile_inc_nulls\": 0.1733936071395874, \"value_count\": 135, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 135.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.08310908079147339, \"percentile_inc_nulls\": 0.17271113395690918, \"value_count\": 131, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 131.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.08235841989517212, \"percentile_inc_nulls\": 0.17203384637832642, \"value_count\": 130, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 130.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.08161354064941406, \"percentile_inc_nulls\": 0.1713617444038391, \"value_count\": 129, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 129.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.08087444305419922, \"percentile_inc_nulls\": 0.17069482803344727, \"value_count\": 128, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 128.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.0801469087600708, \"percentile_inc_nulls\": 0.1700384020805359, \"value_count\": 126, \"group_name\": \"_loc_of_incorporation_\", 
\"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 126.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.07942509651184082, \"percentile_inc_nulls\": 0.16938716173171997, \"value_count\": 125, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 125.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.07870912551879883, \"percentile_inc_nulls\": 0.1687411069869995, \"value_count\": 124, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 124.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.07799887657165527, \"percentile_inc_nulls\": 0.1681002974510193, \"value_count\": 123, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 123.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.07729440927505493, \"percentile_inc_nulls\": 0.16746467351913452, \"value_count\": 122, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 122.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.0765957236289978, \"percentile_inc_nulls\": 0.16683423519134521, \"value_count\": 121, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 121.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.07590281963348389, \"percentile_inc_nulls\": 0.16620904207229614, \"value_count\": 120, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 
0.07523876428604126, \"percentile_inc_nulls\": 0.16560989618301392, \"value_count\": 115, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 115.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.07459205389022827, \"percentile_inc_nulls\": 0.16502636671066284, \"value_count\": 112, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.07395690679550171, \"percentile_inc_nulls\": 0.16445326805114746, \"value_count\": 110, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.07333904504776001, \"percentile_inc_nulls\": 0.163895845413208, \"value_count\": 107, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 107.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.07213801145553589, \"percentile_inc_nulls\": 0.16281211376190186, \"value_count\": 104, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 208.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.0715605616569519, \"percentile_inc_nulls\": 0.16229116916656494, \"value_count\": 100, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 100.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.07098889350891113, \"percentile_inc_nulls\": 0.1617753505706787, \"value_count\": 99, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, 
\"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.07042300701141357, \"percentile_inc_nulls\": 0.16126477718353271, \"value_count\": 98, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.06986868381500244, \"percentile_inc_nulls\": 0.1607646346092224, \"value_count\": 96, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 96.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.06932014226913452, \"percentile_inc_nulls\": 0.16026967763900757, \"value_count\": 95, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 95.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.06878310441970825, \"percentile_inc_nulls\": 0.15978515148162842, \"value_count\": 93, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 93.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.06775528192520142, \"percentile_inc_nulls\": 0.15885776281356812, \"value_count\": 89, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 178.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.066739022731781, \"percentile_inc_nulls\": 0.1579408049583435, \"value_count\": 88, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 176.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.06625401973724365, \"percentile_inc_nulls\": 
0.1575031876564026, \"value_count\": 84, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 84.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.06578052043914795, \"percentile_inc_nulls\": 0.1570759415626526, \"value_count\": 82, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 82.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.06532436609268188, \"percentile_inc_nulls\": 0.15666437149047852, \"value_count\": 79, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 79.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.06487393379211426, \"percentile_inc_nulls\": 0.1562579870223999, \"value_count\": 78, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.06442934274673462, \"percentile_inc_nulls\": 0.15585678815841675, \"value_count\": 77, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 77.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.06401360034942627, \"percentile_inc_nulls\": 0.15548169612884521, \"value_count\": 72, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 72.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.06320518255233765, \"percentile_inc_nulls\": 0.15475231409072876, \"value_count\": 70, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 
140.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.0624314546585083, \"percentile_inc_nulls\": 0.1540541648864746, \"value_count\": 67, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 134.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.062056124210357666, \"percentile_inc_nulls\": 0.15371549129486084, \"value_count\": 65, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 65.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.06169229745864868, \"percentile_inc_nulls\": 0.15338724851608276, \"value_count\": 63, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 63.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.061334311962127686, \"percentile_inc_nulls\": 0.15306425094604492, \"value_count\": 62, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 62.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.0609821081161499, \"percentile_inc_nulls\": 0.15274643898010254, \"value_count\": 61, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 61.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.05994272232055664, \"percentile_inc_nulls\": 0.15180861949920654, \"value_count\": 60, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 180.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.05960202217102051, \"percentile_inc_nulls\": 0.15150123834609985, \"value_count\": 59, \"group_name\": 
\"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 59.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.058262407779693604, \"percentile_inc_nulls\": 0.1502925157546997, \"value_count\": 58, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 232.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.05793905258178711, \"percentile_inc_nulls\": 0.15000081062316895, \"value_count\": 56, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.05666869878768921, \"percentile_inc_nulls\": 0.14885461330413818, \"value_count\": 55, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 220.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.05635690689086914, \"percentile_inc_nulls\": 0.14857321977615356, \"value_count\": 54, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 54.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.056062400341033936, \"percentile_inc_nulls\": 0.14830756187438965, \"value_count\": 51, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 51.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.055779457092285156, \"percentile_inc_nulls\": 0.14805227518081665, \"value_count\": 49, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 49.0, \"distinct_value_count\": 3414}, 
{\"percentile_ex_nulls\": 0.05494797229766846, \"percentile_inc_nulls\": 0.1473020315170288, \"value_count\": 48, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.054676592350006104, \"percentile_inc_nulls\": 0.14705711603164673, \"value_count\": 47, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 47.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.05442249774932861, \"percentile_inc_nulls\": 0.14682787656784058, \"value_count\": 44, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 44.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.05367761850357056, \"percentile_inc_nulls\": 0.14615583419799805, \"value_count\": 43, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 129.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.05295008420944214, \"percentile_inc_nulls\": 0.1454993486404419, \"value_count\": 42, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 126.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.05200308561325073, \"percentile_inc_nulls\": 0.14464491605758667, \"value_count\": 41, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 164.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.051541149616241455, \"percentile_inc_nulls\": 0.14422810077667236, \"value_count\": 40, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 
173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 80.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.051090776920318604, \"percentile_inc_nulls\": 0.14382171630859375, \"value_count\": 39, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.05021309852600098, \"percentile_inc_nulls\": 0.1430298089981079, \"value_count\": 38, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 152.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.0499994158744812, \"percentile_inc_nulls\": 0.1428370475769043, \"value_count\": 37, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 37.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.049791574478149414, \"percentile_inc_nulls\": 0.14264947175979614, \"value_count\": 36, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 36.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.04939889907836914, \"percentile_inc_nulls\": 0.14229518175125122, \"value_count\": 34, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 68.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.04920834302902222, \"percentile_inc_nulls\": 0.142123281955719, \"value_count\": 33, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.04885035753250122, 
\"percentile_inc_nulls\": 0.14180028438568115, \"value_count\": 31, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 62.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.048503875732421875, \"percentile_inc_nulls\": 0.14148765802383423, \"value_count\": 30, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 60.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.04783409833908081, \"percentile_inc_nulls\": 0.14088332653045654, \"value_count\": 29, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 116.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.04751068353652954, \"percentile_inc_nulls\": 0.140591561794281, \"value_count\": 28, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.046887099742889404, \"percentile_inc_nulls\": 0.14002883434295654, \"value_count\": 27, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.046286582946777344, \"percentile_inc_nulls\": 0.13948702812194824, \"value_count\": 26, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 104.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.04542040824890137, \"percentile_inc_nulls\": 0.1387055516242981, \"value_count\": 25, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, 
\"sum_tokens_in_value_count_group\": 150.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.0451432466506958, \"percentile_inc_nulls\": 0.13845545053482056, \"value_count\": 24, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 48.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.04421359300613403, \"percentile_inc_nulls\": 0.13761663436889648, \"value_count\": 23, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 161.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.04332435131072998, \"percentile_inc_nulls\": 0.13681429624557495, \"value_count\": 22, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 154.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.04296058416366577, \"percentile_inc_nulls\": 0.13648605346679688, \"value_count\": 21, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 63.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.042614102363586426, \"percentile_inc_nulls\": 0.13617348670959473, \"value_count\": 20, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 60.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.041517019271850586, \"percentile_inc_nulls\": 0.13518357276916504, \"value_count\": 19, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 190.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.04068553447723389, \"percentile_inc_nulls\": 0.1344333291053772, 
\"value_count\": 18, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.039900243282318115, \"percentile_inc_nulls\": 0.13372474908828735, \"value_count\": 17, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 136.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.039161086082458496, \"percentile_inc_nulls\": 0.13305789232254028, \"value_count\": 16, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 128.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.03803515434265137, \"percentile_inc_nulls\": 0.13204193115234375, \"value_count\": 15, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 195.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.03690338134765625, \"percentile_inc_nulls\": 0.13102078437805176, \"value_count\": 14, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 196.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.035927534103393555, \"percentile_inc_nulls\": 0.1301403045654297, \"value_count\": 13, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 169.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.03447240591049194, \"percentile_inc_nulls\": 0.12882739305496216, \"value_count\": 12, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 252.0, 
\"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.03345614671707153, \"percentile_inc_nulls\": 0.12791043519973755, \"value_count\": 11, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 176.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.03218579292297363, \"percentile_inc_nulls\": 0.1267642378807068, \"value_count\": 10, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 220.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.030782639980316162, \"percentile_inc_nulls\": 0.12549817562103271, \"value_count\": 9, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 243.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.029443025588989258, \"percentile_inc_nulls\": 0.12428951263427734, \"value_count\": 8, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 232.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.027866661548614502, \"percentile_inc_nulls\": 0.1228671669960022, \"value_count\": 7, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 273.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.025857210159301758, \"percentile_inc_nulls\": 0.12105411291122437, \"value_count\": 6, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 348.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.023258745670318604, \"percentile_inc_nulls\": 0.11870956420898438, \"value_count\": 5, \"group_name\": 
\"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 450.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.020371615886688232, \"percentile_inc_nulls\": 0.1161046028137207, \"value_count\": 4, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 500.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.016785800457000732, \"percentile_inc_nulls\": 0.11286920309066772, \"value_count\": 3, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 621.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.011473476886749268, \"percentile_inc_nulls\": 0.10807597637176514, \"value_count\": 2, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 920.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.09772378206253052, \"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 1987.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.6348696947097778, \"percentile_inc_nulls\": 0.6705515384674072, \"value_count\": 63234, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 63234.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.6021872758865356, \"percentile_inc_nulls\": 0.6410630345344543, \"value_count\": 5660, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 5660.0, \"distinct_value_count\": 3414}, 
{\"percentile_ex_nulls\": 0.5841195583343506, \"percentile_inc_nulls\": 0.6247609853744507, \"value_count\": 3129, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 3129.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.5667967796325684, \"percentile_inc_nulls\": 0.6091310381889343, \"value_count\": 3000, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 3000.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.5504555702209473, \"percentile_inc_nulls\": 0.5943867564201355, \"value_count\": 2830, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 2830.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.5343049764633179, \"percentile_inc_nulls\": 0.5798144340515137, \"value_count\": 2797, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 2797.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.5183506011962891, \"percentile_inc_nulls\": 0.5654191970825195, \"value_count\": 2763, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 2763.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.5036782026290894, \"percentile_inc_nulls\": 0.5521806478500366, \"value_count\": 2541, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 2541.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.4890981912612915, \"percentile_inc_nulls\": 0.5390254259109497, \"value_count\": 2525, \"group_name\": \"_loc_of_incorporation_\", 
\"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 2525.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.475580632686615, \"percentile_inc_nulls\": 0.5268288254737854, \"value_count\": 2341, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 2341.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.46294069290161133, \"percentile_inc_nulls\": 0.5154241323471069, \"value_count\": 2189, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 2189.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.45034700632095337, \"percentile_inc_nulls\": 0.504061222076416, \"value_count\": 2181, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 2181.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.4387292265892029, \"percentile_inc_nulls\": 0.4935786724090576, \"value_count\": 2012, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 2012.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.4272788166999817, \"percentile_inc_nulls\": 0.4832472801208496, \"value_count\": 1983, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 1983.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.41723155975341797, \"percentile_inc_nulls\": 0.4741818904876709, \"value_count\": 1740, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 1740.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 
0.40758275985717773, \"percentile_inc_nulls\": 0.46547603607177734, \"value_count\": 1671, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 1671.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.39799749851226807, \"percentile_inc_nulls\": 0.45682740211486816, \"value_count\": 1660, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 1660.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.38937652111053467, \"percentile_inc_nulls\": 0.4490489363670349, \"value_count\": 1493, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 1493.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.38093453645706177, \"percentile_inc_nulls\": 0.4414319396018982, \"value_count\": 1462, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 1462.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.37274080514907837, \"percentile_inc_nulls\": 0.4340389370918274, \"value_count\": 1419, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 1419.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.36468571424484253, \"percentile_inc_nulls\": 0.42677098512649536, \"value_count\": 1395, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 1395.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.35694819688796997, \"percentile_inc_nulls\": 0.4197896122932434, \"value_count\": 1340, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, 
\"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 1340.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.3492279648780823, \"percentile_inc_nulls\": 0.4128238558769226, \"value_count\": 1337, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 1337.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.34196972846984863, \"percentile_inc_nulls\": 0.4062749147415161, \"value_count\": 1257, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 1257.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.33505213260650635, \"percentile_inc_nulls\": 0.4000333547592163, \"value_count\": 1198, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 1198.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.32839441299438477, \"percentile_inc_nulls\": 0.3940262198448181, \"value_count\": 1153, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 1153.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.32215243577957153, \"percentile_inc_nulls\": 0.38839423656463623, \"value_count\": 1081, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 1081.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.3159450888633728, \"percentile_inc_nulls\": 0.38279348611831665, \"value_count\": 1075, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 1075.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.3097723722457886, 
\"percentile_inc_nulls\": 0.37722402811050415, \"value_count\": 1069, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 1069.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.3036343455314636, \"percentile_inc_nulls\": 0.37168580293655396, \"value_count\": 1063, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 1063.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.2977503538131714, \"percentile_inc_nulls\": 0.3663768172264099, \"value_count\": 1019, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 1019.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.2919183373451233, \"percentile_inc_nulls\": 0.3611147403717041, \"value_count\": 1010, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 1010.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.28632885217666626, \"percentile_inc_nulls\": 0.35607147216796875, \"value_count\": 968, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 968.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.28077399730682373, \"percentile_inc_nulls\": 0.3510594367980957, \"value_count\": 962, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 962.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.2752307057380676, \"percentile_inc_nulls\": 0.3460578918457031, \"value_count\": 960, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 
191939, \"sum_tokens_in_value_count_group\": 960.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.27040916681289673, \"percentile_inc_nulls\": 0.3417075276374817, \"value_count\": 835, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 835.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.2656973600387573, \"percentile_inc_nulls\": 0.3374561667442322, \"value_count\": 816, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 816.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.26118767261505127, \"percentile_inc_nulls\": 0.33338719606399536, \"value_count\": 781, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 781.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.25678765773773193, \"percentile_inc_nulls\": 0.3294171690940857, \"value_count\": 762, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 762.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.252399206161499, \"percentile_inc_nulls\": 0.3254575729370117, \"value_count\": 760, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 760.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.24801653623580933, \"percentile_inc_nulls\": 0.3215031623840332, \"value_count\": 759, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 759.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.24395722150802612, \"percentile_inc_nulls\": 0.317840576171875, 
\"value_count\": 703, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 703.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.23995566368103027, \"percentile_inc_nulls\": 0.3142300248146057, \"value_count\": 693, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 693.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.23601758480072021, \"percentile_inc_nulls\": 0.31067681312561035, \"value_count\": 682, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 682.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.23232203722000122, \"percentile_inc_nulls\": 0.30734241008758545, \"value_count\": 640, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 640.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.2286553978919983, \"percentile_inc_nulls\": 0.30403411388397217, \"value_count\": 635, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 635.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.22500604391098022, \"percentile_inc_nulls\": 0.30074137449264526, \"value_count\": 632, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 632.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.2214837670326233, \"percentile_inc_nulls\": 0.2975633144378662, \"value_count\": 610, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 610.0, 
\"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.21799033880233765, \"percentile_inc_nulls\": 0.29441124200820923, \"value_count\": 605, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 605.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.21463549137115479, \"percentile_inc_nulls\": 0.291384220123291, \"value_count\": 581, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 581.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.21146541833877563, \"percentile_inc_nulls\": 0.28852397203445435, \"value_count\": 549, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 549.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.2083357572555542, \"percentile_inc_nulls\": 0.28570014238357544, \"value_count\": 542, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 542.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.20545434951782227, \"percentile_inc_nulls\": 0.2831003665924072, \"value_count\": 499, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 499.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.2026076316833496, \"percentile_inc_nulls\": 0.2805318236351013, \"value_count\": 493, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 493.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.19980138540267944, \"percentile_inc_nulls\": 0.27799975872039795, \"value_count\": 486, \"group_name\": 
\"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 486.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.19443130493164062, \"percentile_inc_nulls\": 0.27315449714660645, \"value_count\": 465, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 930.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.1918848156929016, \"percentile_inc_nulls\": 0.2708568572998047, \"value_count\": 441, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 441.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.1893787980079651, \"percentile_inc_nulls\": 0.26859575510025024, \"value_count\": 434, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 434.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.18691319227218628, \"percentile_inc_nulls\": 0.26637107133865356, \"value_count\": 427, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 427.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.18447065353393555, \"percentile_inc_nulls\": 0.26416724920272827, \"value_count\": 423, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 423.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.18205124139785767, \"percentile_inc_nulls\": 0.26198428869247437, \"value_count\": 419, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 419.0, \"distinct_value_count\": 3414}, 
{\"percentile_ex_nulls\": 0.17964917421340942, \"percentile_inc_nulls\": 0.2598169445991516, \"value_count\": 416, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 416.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.17725861072540283, \"percentile_inc_nulls\": 0.2576599717140198, \"value_count\": 414, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 414.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.1748969554901123, \"percentile_inc_nulls\": 0.25552910566329956, \"value_count\": 409, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 409.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.17254102230072021, \"percentile_inc_nulls\": 0.2534034252166748, \"value_count\": 408, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 408.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.16810637712478638, \"percentile_inc_nulls\": 0.24940216541290283, \"value_count\": 384, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 768.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.1659756898880005, \"percentile_inc_nulls\": 0.24747967720031738, \"value_count\": 369, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 369.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.1639488935470581, \"percentile_inc_nulls\": 0.24565094709396362, \"value_count\": 351, \"group_name\": \"_loc_of_incorporation_\", 
\"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 351.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.15795522928237915, \"percentile_inc_nulls\": 0.24024301767349243, \"value_count\": 346, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 1038.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.15596884489059448, \"percentile_inc_nulls\": 0.2384507656097412, \"value_count\": 344, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 344.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.15399408340454102, \"percentile_inc_nulls\": 0.23666894435882568, \"value_count\": 342, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 342.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.1520366072654724, \"percentile_inc_nulls\": 0.2349027395248413, \"value_count\": 339, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 339.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.15009641647338867, \"percentile_inc_nulls\": 0.23315221071243286, \"value_count\": 336, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 336.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.14823132753372192, \"percentile_inc_nulls\": 0.23146939277648926, \"value_count\": 323, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 323.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 
0.14638358354568481, \"percentile_inc_nulls\": 0.2298021912574768, \"value_count\": 320, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 320.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.14454156160354614, \"percentile_inc_nulls\": 0.22814017534255981, \"value_count\": 319, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 319.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.14272844791412354, \"percentile_inc_nulls\": 0.22650426626205444, \"value_count\": 314, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 314.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.14093846082687378, \"percentile_inc_nulls\": 0.22488915920257568, \"value_count\": 310, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 310.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.137462317943573, \"percentile_inc_nulls\": 0.22175276279449463, \"value_count\": 301, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 602.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.13573580980300903, \"percentile_inc_nulls\": 0.22019493579864502, \"value_count\": 299, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 299.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.1340208649635315, \"percentile_inc_nulls\": 0.21864759922027588, \"value_count\": 297, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, 
\"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 297.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.13060247898101807, \"percentile_inc_nulls\": 0.21556329727172852, \"value_count\": 296, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 592.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.1289510726928711, \"percentile_inc_nulls\": 0.21407324075698853, \"value_count\": 286, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 286.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.12732845544815063, \"percentile_inc_nulls\": 0.21260923147201538, \"value_count\": 281, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 281.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.1257232427597046, \"percentile_inc_nulls\": 0.2111608386039734, \"value_count\": 278, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 278.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.12418150901794434, \"percentile_inc_nulls\": 0.20976978540420532, \"value_count\": 267, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 267.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.1227494478225708, \"percentile_inc_nulls\": 0.2084776759147644, \"value_count\": 248, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 248.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.11990851163864136, 
\"percentile_inc_nulls\": 0.20591437816619873, \"value_count\": 246, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 492.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.11854583024978638, \"percentile_inc_nulls\": 0.2046847939491272, \"value_count\": 236, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 236.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.11724084615707397, \"percentile_inc_nulls\": 0.20350736379623413, \"value_count\": 226, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 226.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.11594158411026001, \"percentile_inc_nulls\": 0.20233511924743652, \"value_count\": 225, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 225.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.11465972661972046, \"percentile_inc_nulls\": 0.20117849111557007, \"value_count\": 222, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 222.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.11338359117507935, \"percentile_inc_nulls\": 0.20002710819244385, \"value_count\": 221, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 221.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.11091220378875732, \"percentile_inc_nulls\": 0.1977972388267517, \"value_count\": 214, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 
191939, \"sum_tokens_in_value_count_group\": 428.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.10968232154846191, \"percentile_inc_nulls\": 0.1966874599456787, \"value_count\": 213, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 213.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.10845816135406494, \"percentile_inc_nulls\": 0.19558298587799072, \"value_count\": 212, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 212.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.10725134611129761, \"percentile_inc_nulls\": 0.1944940686225891, \"value_count\": 209, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 209.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.10606759786605835, \"percentile_inc_nulls\": 0.1934260129928589, \"value_count\": 205, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 205.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.10490697622299194, \"percentile_inc_nulls\": 0.19237881898880005, \"value_count\": 201, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 201.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.10375213623046875, \"percentile_inc_nulls\": 0.19133681058883667, \"value_count\": 200, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.10145395994186401, \"percentile_inc_nulls\": 0.18926328420639038, 
\"value_count\": 199, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 398.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.10031640529632568, \"percentile_inc_nulls\": 0.18823689222335815, \"value_count\": 197, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 197.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.09918469190597534, \"percentile_inc_nulls\": 0.18721574544906616, \"value_count\": 196, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 196.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.09695577621459961, \"percentile_inc_nulls\": 0.18520468473434448, \"value_count\": 193, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 386.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.09483087062835693, \"percentile_inc_nulls\": 0.1832873821258545, \"value_count\": 184, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 368.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.09379148483276367, \"percentile_inc_nulls\": 0.18234962224960327, \"value_count\": 180, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 180.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.0927925705909729, \"percentile_inc_nulls\": 0.18144828081130981, \"value_count\": 173, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 173.0, 
\"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 0.09180516004562378, \"percentile_inc_nulls\": 0.18055737018585205, \"value_count\": 171, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 171.0, \"distinct_value_count\": 3414}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 169, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"sum_tokens_in_value_count_group\": 169.0, \"distinct_value_count\": 3414}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"loc_of_incorporation\\\"\", \"subtitle\": \"In this col, 18,757 values (9.8%) are null and there are 3414 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 63234, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"delaware\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"distinct_value_count\": 3414}, {\"value_count\": 5660, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"united kingdom\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"distinct_value_count\": 3414}, {\"value_count\": 3129, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"texas\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, 
\"distinct_value_count\": 3414}, {\"value_count\": 3000, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"netherlands\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"distinct_value_count\": 3414}, {\"value_count\": 2830, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"germany\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"distinct_value_count\": 3414}, {\"value_count\": 2797, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"california\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"distinct_value_count\": 3414}, {\"value_count\": 2763, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"cayman islands\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"distinct_value_count\": 3414}, {\"value_count\": 2541, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"china\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"distinct_value_count\": 3414}, {\"value_count\": 2525, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"florida\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"distinct_value_count\": 3414}, {\"value_count\": 2341, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"australia\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"distinct_value_count\": 3414}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", 
\"value\": \"united states oregon\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"distinct_value_count\": 3414}, {\"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"united states delaware united states delaware\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"distinct_value_count\": 3414}, {\"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"puerto rico, united states territory\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"distinct_value_count\": 3414}, {\"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"channel islands\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"distinct_value_count\": 3414}, {\"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"limited vietnam\", \"total_non_null_rows\": 173182, \"total_rows_inc_nulls\": 191939, \"distinct_value_count\": 3414}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 63234]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}], \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\"}, {\"mode\": \"vega-lite\"});\n",
        "</script>"
       ],
       "text/plain": [
        "alt.VConcatChart(...)"
       ]
      },
-     "execution_count": 95,
+     "execution_count": 78,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1566,7 +1453,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 96,
+   "execution_count": 79,
    "id": "2a57f717-140f-434d-8998-983b8bf38ac5",
    "metadata": {
     "tags": []
@@ -1577,23 +1464,23 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-df3493bc8ab644cd88e67dc7251d7ca5.vega-embed {\n",
+       "  #altair-viz-d7a7cb9ed31b4cfb8ae1bea0e71c626b.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-df3493bc8ab644cd88e67dc7251d7ca5.vega-embed details,\n",
-       "  #altair-viz-df3493bc8ab644cd88e67dc7251d7ca5.vega-embed details summary {\n",
+       "  #altair-viz-d7a7cb9ed31b4cfb8ae1bea0e71c626b.vega-embed details,\n",
+       "  #altair-viz-d7a7cb9ed31b4cfb8ae1bea0e71c626b.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-df3493bc8ab644cd88e67dc7251d7ca5\"></div>\n",
+       "<div id=\"altair-viz-d7a7cb9ed31b4cfb8ae1bea0e71c626b\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-df3493bc8ab644cd88e67dc7251d7ca5\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-df3493bc8ab644cd88e67dc7251d7ca5\");\n",
+       "    if (outputDiv.id !== \"altair-viz-d7a7cb9ed31b4cfb8ae1bea0e71c626b\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-d7a7cb9ed31b4cfb8ae1bea0e71c626b\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -1639,14 +1526,14 @@
        "        .catch(showError)\n",
        "        .then(() => displayChart(vegaEmbed));\n",
        "    }\n",
-       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"vconcat\": [{\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 7107, \"group_name\": \"_report_year_\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 7107.0, \"distinct_value_count\": 1}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 7107, \"group_name\": \"_report_year_\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 7107.0, \"distinct_value_count\": 1}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"report_year\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 1 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 7107, \"group_name\": \"_report_year_\", \"value\": \"2023\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 1}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": 
\"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 1 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 7107, \"group_name\": \"_report_year_\", \"value\": \"2023\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 1}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 7107]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 1 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9983115196228027, \"percentile_inc_nulls\": 0.9983115196228027, \"value_count\": 2, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 12.0, \"distinct_value_count\": 7101}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 1, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 7095.0, \"distinct_value_count\": 7101}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 2, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 12.0, \"distinct_value_count\": 7101}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", 
\"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"company_name\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 7101 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 2, \"group_name\": \"_company_name_\", \"value\": \"bill holdings, incorporated\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 7101}, {\"value_count\": 2, \"group_name\": \"_company_name_\", \"value\": \"aclarion, incorporated\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 7101}, {\"value_count\": 2, \"group_name\": \"_company_name_\", \"value\": \"augusta gold corporation\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 7101}, {\"value_count\": 2, \"group_name\": \"_company_name_\", \"value\": \"anavex life sciences corporation\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 7101}, {\"value_count\": 2, \"group_name\": \"_company_name_\", \"value\": \"inhibikase therapeutics, incorporated\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 7101}, {\"value_count\": 2, \"group_name\": \"_company_name_\", \"value\": \"magenta therapeutics, incorporated\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 7101}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"waters corp /de/\", \"total_non_null_rows\": 7107, 
\"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 7101}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"core laboratories n v\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 7101}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"optical cable corporation\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 7101}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"blonder tongue laboratories incorporated\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 7101}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"waters corp /de/\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 7101}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"core laboratories n v\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 7101}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"optical cable corporation\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 7101}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"blonder tongue laboratories incorporated\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 7101}, 
{\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"novavax incorporated\", \"total_non_null_rows\": 7107, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 7101}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 2]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.41720396280288696, \"percentile_inc_nulls\": 0.4785422682762146, \"value_count\": 3706, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 3706.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.31655919551849365, \"percentile_inc_nulls\": 0.3884902000427246, \"value_count\": 640, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 640.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.26576507091522217, \"percentile_inc_nulls\": 0.3430420756340027, \"value_count\": 323, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 323.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.23572885990142822, \"percentile_inc_nulls\": 0.3161671757698059, \"value_count\": 191, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 191.0, 
\"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.21811604499816895, \"percentile_inc_nulls\": 0.3004080653190613, \"value_count\": 112, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.20113223791122437, \"percentile_inc_nulls\": 0.2852117419242859, \"value_count\": 108, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.1857210397720337, \"percentile_inc_nulls\": 0.2714225649833679, \"value_count\": 98, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.17282593250274658, \"percentile_inc_nulls\": 0.2598845958709717, \"value_count\": 82, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 82.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.16071707010269165, \"percentile_inc_nulls\": 0.24905025959014893, \"value_count\": 77, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 77.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.14939457178115845, \"percentile_inc_nulls\": 0.23891937732696533, \"value_count\": 72, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 72.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.1390155553817749, \"percentile_inc_nulls\": 0.22963273525238037, \"value_count\": 66, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, 
\"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 66.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.12989461421966553, \"percentile_inc_nulls\": 0.22147178649902344, \"value_count\": 58, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.12218904495239258, \"percentile_inc_nulls\": 0.21457719802856445, \"value_count\": 49, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 49.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.11479794979095459, \"percentile_inc_nulls\": 0.2079640030860901, \"value_count\": 47, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 47.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.10787862539291382, \"percentile_inc_nulls\": 0.20177292823791504, \"value_count\": 44, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 44.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.10127377510070801, \"percentile_inc_nulls\": 0.1958632469177246, \"value_count\": 42, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.08869320154190063, \"percentile_inc_nulls\": 0.18460673093795776, \"value_count\": 40, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 80.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.08256012201309204, \"percentile_inc_nulls\": 0.17911916971206665, \"value_count\": 39, 
\"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 39.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.07689887285232544, \"percentile_inc_nulls\": 0.17405372858047485, \"value_count\": 36, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 36.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.07139486074447632, \"percentile_inc_nulls\": 0.16912901401519775, \"value_count\": 35, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 35.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.06683439016342163, \"percentile_inc_nulls\": 0.1650485396385193, \"value_count\": 29, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 29.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.05834251642227173, \"percentile_inc_nulls\": 0.1574503779411316, \"value_count\": 27, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 54.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.054725587368011475, \"percentile_inc_nulls\": 0.15421414375305176, \"value_count\": 23, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 23.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.05158042907714844, \"percentile_inc_nulls\": 0.15140002965927124, \"value_count\": 20, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 20.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 
0.048592567443847656, \"percentile_inc_nulls\": 0.14872658252716064, \"value_count\": 19, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 19.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.043245792388916016, \"percentile_inc_nulls\": 0.14394259452819824, \"value_count\": 17, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 34.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.036169230937957764, \"percentile_inc_nulls\": 0.1376107931137085, \"value_count\": 15, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 45.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.03176599740982056, \"percentile_inc_nulls\": 0.13367104530334473, \"value_count\": 14, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 28.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.027677297592163086, \"percentile_inc_nulls\": 0.13001269102096558, \"value_count\": 13, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 26.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.024217665195465088, \"percentile_inc_nulls\": 0.12691712379455566, \"value_count\": 11, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 22.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.02107250690460205, \"percentile_inc_nulls\": 0.12410300970077515, \"value_count\": 10, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, 
\"sum_tokens_in_value_count_group\": 20.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.0196571946144104, \"percentile_inc_nulls\": 0.1228366494178772, \"value_count\": 9, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 9.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.01839911937713623, \"percentile_inc_nulls\": 0.12171101570129395, \"value_count\": 8, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 8.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.016197502613067627, \"percentile_inc_nulls\": 0.11974108219146729, \"value_count\": 7, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 14.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.01431041955947876, \"percentile_inc_nulls\": 0.11805260181427002, \"value_count\": 6, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 12.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.011165261268615723, \"percentile_inc_nulls\": 0.1152384877204895, \"value_count\": 5, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 20.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.007391095161437988, \"percentile_inc_nulls\": 0.11186152696609497, \"value_count\": 4, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 24.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.004088699817657471, \"percentile_inc_nulls\": 0.10890668630599976, \"value_count\": 3, \"group_name\": 
\"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 21.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.0015725493431091309, \"percentile_inc_nulls\": 0.10665541887283325, \"value_count\": 2, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.1052483320236206, \"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 10.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 3706, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"sum_tokens_in_value_count_group\": 3706.0, \"distinct_value_count\": 81}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"loc_of_incorporation\\\"\", \"subtitle\": \"In this col, 748 values (10.5%) are null and there are 81 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 3706, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"delaware\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, 
\"distinct_value_count\": 81}, {\"value_count\": 640, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"nevada\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 81}, {\"value_count\": 323, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"maryland\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 81}, {\"value_count\": 191, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"e9\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 81}, {\"value_count\": 112, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"florida\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 81}, {\"value_count\": 108, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"new york\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 81}, {\"value_count\": 98, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"north carolina\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 81}, {\"value_count\": 82, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"pennsylvania\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 81}, {\"value_count\": 77, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"texas\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 81}, {\"value_count\": 72, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"california\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 81}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": 
\"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"t3\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 81}, {\"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"f4\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 81}, {\"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"c5\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 81}, {\"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"p8\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 81}, {\"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"a3\", \"total_non_null_rows\": 6359, \"total_rows_inc_nulls\": 7107, \"distinct_value_count\": 81}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 3706]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}], \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\"}, {\"mode\": \"vega-lite\"});\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"vconcat\": [{\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 7131, \"group_name\": \"_report_year_\", \"total_non_null_rows\": 7131, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 7131.0, \"distinct_value_count\": 1}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 7131, \"group_name\": \"_report_year_\", \"total_non_null_rows\": 7131, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 7131.0, \"distinct_value_count\": 1}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"report_year\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 1 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 7131, \"group_name\": \"_report_year_\", \"value\": \"2023\", \"total_non_null_rows\": 7131, \"total_rows_inc_nulls\": 7131, \"distinct_value_count\": 1}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": 
\"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 1 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 7131, \"group_name\": \"_report_year_\", \"value\": \"2023\", \"total_non_null_rows\": 7131, \"total_rows_inc_nulls\": 7131, \"distinct_value_count\": 1}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 7131]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 1 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9995793104171753, \"percentile_inc_nulls\": 0.9995793104171753, \"value_count\": 3, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 7131, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 3.0, \"distinct_value_count\": 7101}, {\"percentile_ex_nulls\": 0.991726279258728, \"percentile_inc_nulls\": 0.991726279258728, \"value_count\": 2, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 7131, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 7101}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 1, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 7131, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 7072.0, \"distinct_value_count\": 7101}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 3, \"group_name\": 
\"_company_name_\", \"total_non_null_rows\": 7131, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 3.0, \"distinct_value_count\": 7101}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"company_name\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 7101 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 3, \"group_name\": \"_company_name_\", \"value\": \"planet green holdings corporation\", \"total_non_null_rows\": 7131, \"total_rows_inc_nulls\": 7131, \"distinct_value_count\": 7101}, {\"value_count\": 2, \"group_name\": \"_company_name_\", \"value\": \"data443 risk mitigation incorporated\", \"total_non_null_rows\": 7131, \"total_rows_inc_nulls\": 7131, \"distinct_value_count\": 7101}, {\"value_count\": 2, \"group_name\": \"_company_name_\", \"value\": \"ameriguard security services incorporated\", \"total_non_null_rows\": 7131, \"total_rows_inc_nulls\": 7131, \"distinct_value_count\": 7101}, {\"value_count\": 2, \"group_name\": \"_company_name_\", \"value\": \"entrada therapeutics incorporated\", \"total_non_null_rows\": 7131, \"total_rows_inc_nulls\": 7131, \"distinct_value_count\": 7101}, {\"value_count\": 2, \"group_name\": \"_company_name_\", \"value\": \"perception capital corp ii\", \"total_non_null_rows\": 7131, \"total_rows_inc_nulls\": 7131, \"distinct_value_count\": 7101}, 
{\"value_count\": 2, \"group_name\": \"_company_name_\", \"value\": \"qdm international incorporated\", \"total_non_null_rows\": 7131, \"total_rows_inc_nulls\": 7131, \"distinct_value_count\": 7101}, {\"value_count\": 2, \"group_name\": \"_company_name_\", \"value\": \"fingermotion incorporated\", \"total_non_null_rows\": 7131, \"total_rows_inc_nulls\": 7131, \"distinct_value_count\": 7101}, {\"value_count\": 2, \"group_name\": \"_company_name_\", \"value\": \"electrameccanica vehicles corporation\", \"total_non_null_rows\": 7131, \"total_rows_inc_nulls\": 7131, \"distinct_value_count\": 7101}, {\"value_count\": 2, \"group_name\": \"_company_name_\", \"value\": \"mountain top properties incorporated\", \"total_non_null_rows\": 7131, \"total_rows_inc_nulls\": 7131, \"distinct_value_count\": 7101}, {\"value_count\": 2, \"group_name\": \"_company_name_\", \"value\": \"magenta therapeutics incorporated\", \"total_non_null_rows\": 7131, \"total_rows_inc_nulls\": 7131, \"distinct_value_count\": 7101}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"mativ holdings incorporated\", \"total_non_null_rows\": 7131, \"total_rows_inc_nulls\": 7131, \"distinct_value_count\": 7101}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"nicholas financial incorporated\", \"total_non_null_rows\": 7131, \"total_rows_inc_nulls\": 7131, \"distinct_value_count\": 7101}, {\"value_count\": 1, \"group_name\": 
\"_company_name_\", \"value\": \"intevac incorporated\", \"total_non_null_rows\": 7131, \"total_rows_inc_nulls\": 7131, \"distinct_value_count\": 7101}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"impac mortgage holdings incorporated\", \"total_non_null_rows\": 7131, \"total_rows_inc_nulls\": 7131, \"distinct_value_count\": 7101}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"tucson electric power company\", \"total_non_null_rows\": 7131, \"total_rows_inc_nulls\": 7131, \"distinct_value_count\": 7101}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 3]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.41789406538009644, \"percentile_inc_nulls\": 0.4790351986885071, \"value_count\": 3715, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 3715.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.3163585066795349, \"percentile_inc_nulls\": 0.38816434144973755, \"value_count\": 648, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 648.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.26574742794036865, \"percentile_inc_nulls\": 0.3428691625595093, \"value_count\": 323, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, 
\"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 323.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.2356628179550171, \"percentile_inc_nulls\": 0.31594449281692505, \"value_count\": 192, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 192.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.2179567813873291, \"percentile_inc_nulls\": 0.300098180770874, \"value_count\": 113, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 113.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.20087748765945435, \"percentile_inc_nulls\": 0.2848128080368042, \"value_count\": 109, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 109.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.18552178144454956, \"percentile_inc_nulls\": 0.2710700035095215, \"value_count\": 98, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.17267316579818726, \"percentile_inc_nulls\": 0.2595708966255188, \"value_count\": 82, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 82.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.1606079339981079, \"percentile_inc_nulls\": 0.2487729787826538, \"value_count\": 77, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 77.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.14932620525360107, \"percentile_inc_nulls\": 0.23867619037628174, \"value_count\": 72, 
\"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 72.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.13898462057113647, \"percentile_inc_nulls\": 0.22942084074020386, \"value_count\": 66, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 66.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.12989658117294312, \"percentile_inc_nulls\": 0.2212873101234436, \"value_count\": 58, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.12221872806549072, \"percentile_inc_nulls\": 0.21441590785980225, \"value_count\": 49, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 49.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.11469757556915283, \"percentile_inc_nulls\": 0.20768475532531738, \"value_count\": 48, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 48.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.10764652490615845, \"percentile_inc_nulls\": 0.20137429237365723, \"value_count\": 45, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 45.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.10106551647186279, \"percentile_inc_nulls\": 0.19548451900482178, \"value_count\": 42, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 
0.08853024244308472, \"percentile_inc_nulls\": 0.1842658519744873, \"value_count\": 40, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 80.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.08241927623748779, \"percentile_inc_nulls\": 0.17879682779312134, \"value_count\": 39, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 39.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.07677841186523438, \"percentile_inc_nulls\": 0.1737484335899353, \"value_count\": 36, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 36.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.07129424810409546, \"percentile_inc_nulls\": 0.16884028911590576, \"value_count\": 35, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 35.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.06675022840499878, \"percentile_inc_nulls\": 0.16477352380752563, \"value_count\": 29, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 29.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.05828893184661865, \"percentile_inc_nulls\": 0.15720093250274658, \"value_count\": 27, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 54.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.05468505620956421, \"percentile_inc_nulls\": 0.15397560596466064, \"value_count\": 23, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, 
\"sum_tokens_in_value_count_group\": 23.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.051551222801208496, \"percentile_inc_nulls\": 0.15117096900939941, \"value_count\": 20, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 20.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.048574090003967285, \"percentile_inc_nulls\": 0.1485065221786499, \"value_count\": 19, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 19.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.04324662685394287, \"percentile_inc_nulls\": 0.14373862743377686, \"value_count\": 17, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 34.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.036195576190948486, \"percentile_inc_nulls\": 0.13742810487747192, \"value_count\": 15, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 45.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.03180819749832153, \"percentile_inc_nulls\": 0.1335015892982483, \"value_count\": 14, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 28.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.02773427963256836, \"percentile_inc_nulls\": 0.12985557317733765, \"value_count\": 13, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 26.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.024287045001983643, \"percentile_inc_nulls\": 0.12677043676376343, \"value_count\": 11, \"group_name\": 
\"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 22.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.021153271198272705, \"percentile_inc_nulls\": 0.1239657998085022, \"value_count\": 10, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 20.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.019743025302886963, \"percentile_inc_nulls\": 0.1227036714553833, \"value_count\": 9, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 9.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.018489480018615723, \"percentile_inc_nulls\": 0.12158185243606567, \"value_count\": 8, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 8.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.01629585027694702, \"percentile_inc_nulls\": 0.11961859464645386, \"value_count\": 7, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 14.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.013475418090820312, \"percentile_inc_nulls\": 0.11709439754486084, \"value_count\": 6, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 18.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.011125028133392334, \"percentile_inc_nulls\": 0.11499089002609253, \"value_count\": 5, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 15.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.007364451885223389, 
\"percentile_inc_nulls\": 0.1116253137588501, \"value_count\": 4, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 24.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.004073977470397949, \"percentile_inc_nulls\": 0.10868042707443237, \"value_count\": 3, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 21.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.0015668869018554688, \"percentile_inc_nulls\": 0.10643666982650757, \"value_count\": 2, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.10503435134887695, \"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 10.0, \"distinct_value_count\": 81}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 3715, \"group_name\": \"_loc_of_incorporation_\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"sum_tokens_in_value_count_group\": 3715.0, \"distinct_value_count\": 81}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": 
\"Distribution of counts of values in column \\\"loc_of_incorporation\\\"\", \"subtitle\": \"In this col, 749 values (10.5%) are null and there are 81 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 3715, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"delaware\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"distinct_value_count\": 81}, {\"value_count\": 648, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"nevada\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"distinct_value_count\": 81}, {\"value_count\": 323, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"maryland\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"distinct_value_count\": 81}, {\"value_count\": 192, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"e9\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"distinct_value_count\": 81}, {\"value_count\": 113, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"florida\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"distinct_value_count\": 81}, {\"value_count\": 109, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"new york\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"distinct_value_count\": 81}, {\"value_count\": 98, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"north carolina\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"distinct_value_count\": 81}, {\"value_count\": 82, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"pennsylvania\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"distinct_value_count\": 81}, {\"value_count\": 77, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"texas\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"distinct_value_count\": 81}, {\"value_count\": 72, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"california\", 
\"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"distinct_value_count\": 81}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"c5\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"distinct_value_count\": 81}, {\"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"p8\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"distinct_value_count\": 81}, {\"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"2m\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"distinct_value_count\": 81}, {\"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"k3\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"distinct_value_count\": 81}, {\"value_count\": 1, \"group_name\": \"_loc_of_incorporation_\", \"value\": \"f4\", \"total_non_null_rows\": 6382, \"total_rows_inc_nulls\": 7131, \"distinct_value_count\": 81}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 3715]}, \"title\": \"Value count\", \"type\": 
\"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}], \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\"}, {\"mode\": \"vega-lite\"});\n",
        "</script>"
       ],
       "text/plain": [
        "alt.VConcatChart(...)"
       ]
      },
-     "execution_count": 96,
+     "execution_count": 79,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1671,7 +1558,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 97,
+   "execution_count": 36,
    "id": "fb6d143b-5201-4b31-849c-97db80781ade",
    "metadata": {
     "tags": []
@@ -1684,19 +1571,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 103,
+   "execution_count": 80,
    "id": "22766c9f-7371-483f-82b0-015549a84357",
    "metadata": {
     "tags": []
    },
    "outputs": [],
    "source": [
-    "br = \"l.report_year = r.report_year and substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3)\""
+    "br = \"substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4)\""
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 104,
+   "execution_count": 81,
    "id": "60937a9c-dff6-4d68-808f-81b8228fc9f6",
    "metadata": {
     "tags": []
@@ -1705,14 +1592,14 @@
     {
      "data": {
       "text/plain": [
-       "{'number_of_comparisons_generated_pre_filter_conditions': 2069828,\n",
-       " 'number_of_comparisons_to_be_scored_post_filter_conditions': 2069828,\n",
+       "{'number_of_comparisons_generated_pre_filter_conditions': 531298,\n",
+       " 'number_of_comparisons_to_be_scored_post_filter_conditions': 531298,\n",
        " 'filter_conditions_identified': '',\n",
-       " 'equi_join_conditions_identified': 'l.report_year = r.report_year AND SUBSTRING(l.company_name_mphone, 1, 3) = SUBSTRING(r.company_name_mphone, 1, 3)',\n",
+       " 'equi_join_conditions_identified': 'SUBSTRING(l.company_name_mphone, 1, 4) = SUBSTRING(r.company_name_mphone, 1, 4)',\n",
        " 'link_type_join_condition': 'where l.\"source_dataset\" || \\'-__-\\' || l.\"record_id\" < r.\"source_dataset\" || \\'-__-\\' || r.\"record_id\" and l.\"source_dataset\" != r.\"source_dataset\"'}"
       ]
      },
-     "execution_count": 104,
+     "execution_count": 81,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1735,7 +1622,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 106,
+   "execution_count": 82,
    "id": "67717313-2c17-4b6b-b984-8f7bc955c678",
    "metadata": {
     "tags": []
@@ -1763,7 +1650,6 @@
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>key_0</th>\n",
-       "      <th>key_1</th>\n",
        "      <th>count_l</th>\n",
        "      <th>count_r</th>\n",
        "      <th>block_count</th>\n",
@@ -1772,40 +1658,37 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>2023</td>\n",
-       "      <td>STR</td>\n",
-       "      <td>68</td>\n",
-       "      <td>1297</td>\n",
-       "      <td>88196</td>\n",
+       "      <td>AMRK</td>\n",
+       "      <td>56</td>\n",
+       "      <td>625</td>\n",
+       "      <td>35000</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>2023</td>\n",
-       "      <td>INT</td>\n",
-       "      <td>62</td>\n",
-       "      <td>1275</td>\n",
-       "      <td>79050</td>\n",
+       "      <td>FRST</td>\n",
+       "      <td>56</td>\n",
+       "      <td>555</td>\n",
+       "      <td>31080</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>2023</td>\n",
-       "      <td>KRN</td>\n",
-       "      <td>60</td>\n",
-       "      <td>1290</td>\n",
-       "      <td>77400</td>\n",
+       "      <td>INTR</td>\n",
+       "      <td>30</td>\n",
+       "      <td>659</td>\n",
+       "      <td>19770</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "   key_0 key_1  count_l  count_r  block_count\n",
-       "0   2023   STR       68     1297        88196\n",
-       "1   2023   INT       62     1275        79050\n",
-       "2   2023   KRN       60     1290        77400"
+       "  key_0  count_l  count_r  block_count\n",
+       "0  AMRK       56      625        35000\n",
+       "1  FRST       56      555        31080\n",
+       "2  INTR       30      659        19770"
       ]
      },
-     "execution_count": 106,
+     "execution_count": 82,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1824,7 +1707,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 107,
+   "execution_count": 84,
    "id": "6fe6fb99-f5fd-4538-a8bc-c9dd41f4ff9c",
    "metadata": {},
    "outputs": [
@@ -1833,23 +1716,23 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-386d63fed4c940d1bd4e29a1fc26421c.vega-embed {\n",
+       "  #altair-viz-e53902624c01436b99966f7e6249a8aa.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-386d63fed4c940d1bd4e29a1fc26421c.vega-embed details,\n",
-       "  #altair-viz-386d63fed4c940d1bd4e29a1fc26421c.vega-embed details summary {\n",
+       "  #altair-viz-e53902624c01436b99966f7e6249a8aa.vega-embed details,\n",
+       "  #altair-viz-e53902624c01436b99966f7e6249a8aa.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-386d63fed4c940d1bd4e29a1fc26421c\"></div>\n",
+       "<div id=\"altair-viz-e53902624c01436b99966f7e6249a8aa\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-386d63fed4c940d1bd4e29a1fc26421c\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-386d63fed4c940d1bd4e29a1fc26421c\");\n",
+       "    if (outputDiv.id !== \"altair-viz-e53902624c01436b99966f7e6249a8aa\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-e53902624c01436b99966f7e6249a8aa\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -1895,14 +1778,14 @@
        "        .catch(showError)\n",
        "        .then(() => displayChart(vegaEmbed));\n",
        "    }\n",
-       "  })({\"config\": {\"view\": {\"continuousWidth\": 300, \"continuousHeight\": 300}}, \"data\": {\"name\": \"data-da90c619ab0a310af714d4034b6664f8\"}, \"mark\": \"bar\", \"encoding\": {\"order\": {\"field\": \"cumulative_rows\"}, \"tooltip\": [{\"field\": \"blocking_rule\", \"title\": \"SQL Condition\", \"type\": \"nominal\"}, {\"field\": \"row_count\", \"format\": \",\", \"title\": \"Comparisons Generated\", \"type\": \"quantitative\"}, {\"field\": \"cumulative_rows\", \"format\": \",\", \"title\": \"Cumulative Comparisons\", \"type\": \"quantitative\"}, {\"field\": \"cartesian\", \"format\": \",\", \"title\": \"Total comparisons in Cartesian product\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"start\", \"title\": \"Comparisons Generated by Rule(s)\", \"type\": \"quantitative\"}, \"x2\": {\"field\": \"cumulative_rows\"}, \"y\": {\"field\": \"blocking_rule\", \"sort\": [\"-x2\"], \"title\": \"SQL Blocking Rule\"}}, \"height\": {\"step\": 20}, \"title\": {\"text\": \"Count of Additional Comparisons Generated by Each Blocking Rule\", \"subtitle\": \"(Counts exclude comparisons already generated by previous rules)\"}, \"width\": 450, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-da90c619ab0a310af714d4034b6664f8\": [{\"blocking_rule\": \"l.report_year = r.report_year and substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3)\", \"row_count\": 2069828, \"cumulative_rows\": 2069828, \"cartesian\": 1365709548, \"match_key\": \"0\", \"start\": 0}]}}, {\"mode\": \"vega-lite\"});\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 300, \"continuousHeight\": 300}}, \"data\": {\"name\": \"data-56db78185748bd6b3cd7818d9e04ceb5\"}, \"mark\": \"bar\", \"encoding\": {\"order\": {\"field\": \"cumulative_rows\"}, \"tooltip\": [{\"field\": \"blocking_rule\", \"title\": \"SQL Condition\", \"type\": \"nominal\"}, {\"field\": \"row_count\", \"format\": \",\", \"title\": \"Comparisons Generated\", \"type\": \"quantitative\"}, {\"field\": \"cumulative_rows\", \"format\": \",\", \"title\": \"Cumulative Comparisons\", \"type\": \"quantitative\"}, {\"field\": \"cartesian\", \"format\": \",\", \"title\": \"Total comparisons in Cartesian product\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"start\", \"title\": \"Comparisons Generated by Rule(s)\", \"type\": \"quantitative\"}, \"x2\": {\"field\": \"cumulative_rows\"}, \"y\": {\"field\": \"blocking_rule\", \"sort\": [\"-x2\"], \"title\": \"SQL Blocking Rule\"}}, \"height\": {\"step\": 20}, \"title\": {\"text\": \"Count of Additional Comparisons Generated by Each Blocking Rule\", \"subtitle\": \"(Counts exclude comparisons already generated by previous rules)\"}, \"width\": 450, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-56db78185748bd6b3cd7818d9e04ceb5\": [{\"blocking_rule\": \"substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4)\", \"row_count\": 531298, \"cumulative_rows\": 531298, \"cartesian\": 1368717009, \"match_key\": \"0\", \"start\": 0}]}}, {\"mode\": \"vega-lite\"});\n",
        "</script>"
       ],
       "text/plain": [
        "alt.Chart(...)"
       ]
      },
-     "execution_count": 107,
+     "execution_count": 84,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1913,8 +1796,7 @@
     ")\n",
     "\n",
     "blocking_rules_for_analysis = [\n",
-    "    # block_on(\"substr(l.company_name_mphone,1,3)\", \"substr(r.company_name_mphone,1,3)\"),\n",
-    "    \"l.report_year = r.report_year and substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3)\"\n",
+    "    br\n",
     "]\n",
     "\n",
     "\n",
@@ -1939,7 +1821,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 108,
+   "execution_count": 44,
    "id": "1f12d114-22fd-4f12-a0be-6a62500e80d5",
    "metadata": {
     "tags": []
@@ -1952,54 +1834,104 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 109,
-   "id": "bb13b160-b554-45d6-a575-5fa2de061350",
-   "metadata": {
-    "tags": []
-   },
+   "execution_count": 120,
+   "id": "e9cf27ac-6f65-4c73-9e11-9445a8977531",
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Comparison 'NameComparison' of \"company_name\".\n",
+      "Comparison 'ExactMatch' of \"company_name\".\n",
       "Similarity is assessed using the following ComparisonLevels:\n",
       "    - 'company_name is NULL' with SQL rule: \"company_name_l\" IS NULL OR \"company_name_r\" IS NULL\n",
       "    - 'Exact match on company_name' with SQL rule: \"company_name_l\" = \"company_name_r\"\n",
-      "    - 'Jaro-Winkler distance of company_name >= 0.92' with SQL rule: jaro_winkler_similarity(\"company_name_l\", \"company_name_r\") >= 0.92\n",
-      "    - 'Jaro-Winkler distance of company_name >= 0.88' with SQL rule: jaro_winkler_similarity(\"company_name_l\", \"company_name_r\") >= 0.88\n",
-      "    - 'Jaro-Winkler distance of company_name >= 0.7' with SQL rule: jaro_winkler_similarity(\"company_name_l\", \"company_name_r\") >= 0.7\n",
       "    - 'All other comparisons' with SQL rule: ELSE\n",
       "\n"
      ]
     }
    ],
    "source": [
-    "company_name_comparison = cl.NameComparison(\n",
+    "company_name_comparison = cl.ExactMatch(\n",
     "    \"company_name\",\n",
-    "    # dmeta_col_name=\"company_name_mphone\" # this was breaking it for some reason\n",
     ")\n",
     "print(company_name_comparison.get_comparison(\"duckdb\").human_readable_description)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 110,
-   "id": "7d2697d3-efdb-4be4-8911-18b457f5bab4",
-   "metadata": {
-    "tags": []
-   },
+   "execution_count": 85,
+   "id": "a0d056b4-b7b5-4f01-ad60-3ffc2bec54eb",
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Comparison 'JaroWinklerAtThresholds' of \"loc_of_incorporation\".\n",
+      "Comparison 'LevenshteinAtThresholds' of \"company_name\".\n",
       "Similarity is assessed using the following ComparisonLevels:\n",
-      "    - 'loc_of_incorporation is NULL' with SQL rule: \"loc_of_incorporation_l\" IS NULL OR \"loc_of_incorporation_r\" IS NULL\n",
+      "    - 'company_name is NULL' with SQL rule: \"company_name_l\" IS NULL OR \"company_name_r\" IS NULL\n",
+      "    - 'Exact match on company_name' with SQL rule: \"company_name_l\" = \"company_name_r\"\n",
+      "    - 'Levenshtein distance of company_name <= 1' with SQL rule: levenshtein(\"company_name_l\", \"company_name_r\") <= 1\n",
+      "    - 'All other comparisons' with SQL rule: ELSE\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "company_name_comparison = cl.LevenshteinAtThresholds(\n",
+    "    \"company_name\",\n",
+    "    distance_threshold_or_thresholds=[1]\n",
+    ")\n",
+    "print(company_name_comparison.get_comparison(\"duckdb\").human_readable_description)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 81,
+   "id": "bf199c98-5239-4a1e-8856-19d74e42b7db",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Comparison 'ArrayIntersectAtSizes' of \"company_name_mphone_list\".\n",
+      "Similarity is assessed using the following ComparisonLevels:\n",
+      "    - 'company_name_mphone_list is NULL' with SQL rule: \"company_name_mphone_list_l\" IS NULL OR \"company_name_mphone_list_r\" IS NULL\n",
+      "    - 'Array intersection size >= 3' with SQL rule: array_length(list_intersect(\"company_name_mphone_list_l\", \"company_name_mphone_list_r\")) >= 3\n",
+      "    - 'Array intersection size >= 2' with SQL rule: array_length(list_intersect(\"company_name_mphone_list_l\", \"company_name_mphone_list_r\")) >= 2\n",
+      "    - 'Array intersection size >= 1' with SQL rule: array_length(list_intersect(\"company_name_mphone_list_l\", \"company_name_mphone_list_r\")) >= 1\n",
+      "    - 'All other comparisons' with SQL rule: ELSE\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "company_name_comparison = cl.ArrayIntersectAtSizes(\n",
+    "    \"company_name_mphone_list\",\n",
+    "    size_threshold_or_thresholds=[3,2,1]\n",
+    ")\n",
+    "print(company_name_comparison.get_comparison(\"duckdb\").human_readable_description)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 122,
+   "id": "7d2697d3-efdb-4be4-8911-18b457f5bab4",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Comparison 'JaroWinklerAtThresholds' of \"loc_of_incorporation\".\n",
+      "Similarity is assessed using the following ComparisonLevels:\n",
+      "    - 'loc_of_incorporation is NULL' with SQL rule: \"loc_of_incorporation_l\" IS NULL OR \"loc_of_incorporation_r\" IS NULL\n",
       "    - 'Exact match on loc_of_incorporation' with SQL rule: \"loc_of_incorporation_l\" = \"loc_of_incorporation_r\"\n",
       "    - 'Jaro-Winkler distance of loc_of_incorporation >= 0.9' with SQL rule: jaro_winkler_similarity(\"loc_of_incorporation_l\", \"loc_of_incorporation_r\") >= 0.9\n",
-      "    - 'Jaro-Winkler distance of loc_of_incorporation >= 0.7' with SQL rule: jaro_winkler_similarity(\"loc_of_incorporation_l\", \"loc_of_incorporation_r\") >= 0.7\n",
       "    - 'All other comparisons' with SQL rule: ELSE\n",
       "\n"
      ]
@@ -2009,13 +1941,42 @@
     "# try with Levenshtein too\n",
     "location_comparison = cl.JaroWinklerAtThresholds(\n",
     "    \"loc_of_incorporation\",\n",
+    "    score_threshold_or_thresholds=[0.9]\n",
+    ")\n",
+    "print(location_comparison.get_comparison(\"duckdb\").human_readable_description)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 121,
+   "id": "f3529a5a-7ced-46dd-af22-7bb44ed92aa2",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Comparison 'ArrayIntersectAtSizes' of \"loc_list\".\n",
+      "Similarity is assessed using the following ComparisonLevels:\n",
+      "    - 'loc_list is NULL' with SQL rule: \"loc_list_l\" IS NULL OR \"loc_list_r\" IS NULL\n",
+      "    - 'Array intersection size >= 2' with SQL rule: array_length(list_intersect(\"loc_list_l\", \"loc_list_r\")) >= 2\n",
+      "    - 'Array intersection size >= 1' with SQL rule: array_length(list_intersect(\"loc_list_l\", \"loc_list_r\")) >= 1\n",
+      "    - 'All other comparisons' with SQL rule: ELSE\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "location_comparison = cl.ArrayIntersectAtSizes(\n",
+    "    \"loc_list\",\n",
+    "    size_threshold_or_thresholds=[2,1]\n",
     ")\n",
     "print(location_comparison.get_comparison(\"duckdb\").human_readable_description)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 111,
+   "execution_count": 123,
    "id": "92c1ad6b-4516-4ab4-90eb-394669c4a02b",
    "metadata": {
     "tags": []
@@ -2048,7 +2009,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 112,
+   "execution_count": 124,
    "id": "e9eb59b9-49cc-45b7-8ffa-b8f7e5372608",
    "metadata": {
     "tags": []
@@ -2057,7 +2018,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "f8061ccbd73c426daa2d35dbf68e55fb",
+       "model_id": "d8daffbf12a14f72a247e47fc2fa719a",
        "version_major": 2,
        "version_minor": 0
       },
@@ -2072,25 +2033,25 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Probability two random records match is estimated to be  0.000689.\n",
-      "This means that amongst all possible pairwise record comparisons, one in 1,452.36 are expected to match.  With 1,365,709,548 total possible comparisons, we expect a total of around 940,336.47 matching pairs\n"
+      "Probability two random records match is estimated to be  8.21e-05.\n",
+      "This means that amongst all possible pairwise record comparisons, one in 12,184.39 are expected to match.  With 1,368,717,009 total possible comparisons, we expect a total of around 112,333.68 matching pairs\n"
      ]
     }
    ],
    "source": [
     "deterministic_rules = [\n",
     "    block_on(\"company_name_mphone\", \"company_name_mphone\"),\n",
-    "    \"jaccard(r.company_name, l.company_name) >= .9 and l.loc_of_incorporation = r.loc_of_incorporation\",\n",
-    "    \"substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3) and jaccard(r.company_name, l.company_name) >= .8\",\n",
+    "    \"jaccard(r.company_name, l.company_name) >= .95 and l.loc_of_incorporation = r.loc_of_incorporation\",\n",
+    "    \"substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3) and jaccard(r.company_name, l.company_name) >= .95\",\n",
     "    # \"substr(l.company_name_mphone,1,5) = substr(r.company_name_mphone,1,5) and l.loc_of_incorporation = r.loc_of_incorporation\"\n",
     "]\n",
     "\n",
-    "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.85)"
+    "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.95)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 113,
+   "execution_count": 125,
    "id": "5117653e-e72b-4c13-b923-d1228b39d357",
    "metadata": {
     "tags": []
@@ -2100,27 +2061,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "----- Estimating u probabilities using random sampling -----\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "e732ac0702e4459b82b86d2de5c9d9fc",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
+      "----- Estimating u probabilities using random sampling -----\n",
       "\n",
       "Estimated u probabilities using random sampling\n",
       "\n",
@@ -2136,7 +2077,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 114,
+   "execution_count": 126,
    "id": "8b089a0d-4c91-4b4d-9806-ed83c9bd59b9",
    "metadata": {
     "tags": []
@@ -2158,31 +2099,31 @@
       "\n",
       "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
       "\n",
-      "Iteration 1: Largest change in params was -0.213 in the m_probability of loc_of_incorporation, level `Exact match on loc_of_incorporation`\n",
-      "Iteration 2: Largest change in params was 0.243 in the m_probability of loc_of_incorporation, level `All other comparisons`\n",
-      "Iteration 3: Largest change in params was 0.0314 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.88`\n",
-      "Iteration 4: Largest change in params was 0.0052 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
-      "Iteration 5: Largest change in params was 0.0087 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
-      "Iteration 6: Largest change in params was 0.0133 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
-      "Iteration 7: Largest change in params was 0.0188 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
-      "Iteration 8: Largest change in params was 0.0246 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
-      "Iteration 9: Largest change in params was 0.0297 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
-      "Iteration 10: Largest change in params was 0.0332 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
-      "Iteration 11: Largest change in params was 0.0346 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
-      "Iteration 12: Largest change in params was 0.0336 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
-      "Iteration 13: Largest change in params was 0.0306 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
-      "Iteration 14: Largest change in params was 0.0264 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
-      "Iteration 15: Largest change in params was 0.0218 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
-      "Iteration 16: Largest change in params was 0.0173 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
-      "Iteration 17: Largest change in params was 0.0134 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
-      "Iteration 18: Largest change in params was 0.0102 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
-      "Iteration 19: Largest change in params was 0.00758 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
-      "Iteration 20: Largest change in params was 0.00559 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
-      "Iteration 21: Largest change in params was 0.00409 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
-      "Iteration 22: Largest change in params was 0.00298 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
-      "Iteration 23: Largest change in params was 0.00216 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
-      "Iteration 24: Largest change in params was 0.00156 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
-      "Iteration 25: Largest change in params was 0.00112 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n",
+      "Iteration 1: Largest change in params was -0.38 in the m_probability of loc_of_incorporation, level `Exact match on loc_of_incorporation`\n",
+      "Iteration 2: Largest change in params was 0.027 in the m_probability of loc_of_incorporation, level `All other comparisons`\n",
+      "Iteration 3: Largest change in params was -0.000274 in the m_probability of company_name, level `Exact match on company_name`\n",
+      "Iteration 4: Largest change in params was -0.00056 in the m_probability of company_name, level `Exact match on company_name`\n",
+      "Iteration 5: Largest change in params was 0.00112 in the m_probability of company_name, level `All other comparisons`\n",
+      "Iteration 6: Largest change in params was 0.00214 in the m_probability of company_name, level `All other comparisons`\n",
+      "Iteration 7: Largest change in params was 0.00387 in the m_probability of company_name, level `All other comparisons`\n",
+      "Iteration 8: Largest change in params was -0.00648 in the m_probability of company_name, level `Exact match on company_name`\n",
+      "Iteration 9: Largest change in params was 0.00989 in the m_probability of company_name, level `All other comparisons`\n",
+      "Iteration 10: Largest change in params was 0.0137 in the m_probability of company_name, level `All other comparisons`\n",
+      "Iteration 11: Largest change in params was 0.0171 in the m_probability of company_name, level `All other comparisons`\n",
+      "Iteration 12: Largest change in params was -0.0197 in the m_probability of company_name, level `Exact match on company_name`\n",
+      "Iteration 13: Largest change in params was 0.0209 in the m_probability of company_name, level `All other comparisons`\n",
+      "Iteration 14: Largest change in params was -0.0209 in the m_probability of company_name, level `Exact match on company_name`\n",
+      "Iteration 15: Largest change in params was -0.0201 in the m_probability of company_name, level `Exact match on company_name`\n",
+      "Iteration 16: Largest change in params was -0.0187 in the m_probability of company_name, level `Exact match on company_name`\n",
+      "Iteration 17: Largest change in params was -0.017 in the m_probability of company_name, level `Exact match on company_name`\n",
+      "Iteration 18: Largest change in params was 0.0153 in the m_probability of company_name, level `All other comparisons`\n",
+      "Iteration 19: Largest change in params was -0.0136 in the m_probability of company_name, level `Exact match on company_name`\n",
+      "Iteration 20: Largest change in params was -0.0121 in the m_probability of company_name, level `Exact match on company_name`\n",
+      "Iteration 21: Largest change in params was -0.0107 in the m_probability of company_name, level `Exact match on company_name`\n",
+      "Iteration 22: Largest change in params was -0.0094 in the m_probability of company_name, level `Exact match on company_name`\n",
+      "Iteration 23: Largest change in params was 0.00828 in the m_probability of company_name, level `All other comparisons`\n",
+      "Iteration 24: Largest change in params was -0.00728 in the m_probability of company_name, level `Exact match on company_name`\n",
+      "Iteration 25: Largest change in params was -0.00641 in the m_probability of company_name, level `Exact match on company_name`\n",
       "\n",
       "EM converged after 25 iterations\n",
       "\n",
@@ -2199,7 +2140,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 115,
+   "execution_count": 127,
    "id": "88e058bc-800d-4da4-92aa-6ddb7377b4bf",
    "metadata": {
     "tags": []
@@ -2210,23 +2151,23 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-3201e7b556c247e9865c46b1acc2ded5.vega-embed {\n",
+       "  #altair-viz-e6cc5953cc934635929617b5e9202b2f.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-3201e7b556c247e9865c46b1acc2ded5.vega-embed details,\n",
-       "  #altair-viz-3201e7b556c247e9865c46b1acc2ded5.vega-embed details summary {\n",
+       "  #altair-viz-e6cc5953cc934635929617b5e9202b2f.vega-embed details,\n",
+       "  #altair-viz-e6cc5953cc934635929617b5e9202b2f.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-3201e7b556c247e9865c46b1acc2ded5\"></div>\n",
+       "<div id=\"altair-viz-e6cc5953cc934635929617b5e9202b2f\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-3201e7b556c247e9865c46b1acc2ded5\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-3201e7b556c247e9865c46b1acc2ded5\");\n",
+       "    if (outputDiv.id !== \"altair-viz-e6cc5953cc934635929617b5e9202b2f\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-e6cc5953cc934635929617b5e9202b2f\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -2272,14 +2213,14 @@
        "        .catch(showError)\n",
        "        .then(() => displayChart(vegaEmbed));\n",
        "    }\n",
-       "  })({\"config\": {\"view\": {\"continuousWidth\": 300, \"continuousHeight\": 300, \"discreteHeight\": 60, \"discreteWidth\": 400}, \"header\": {\"title\": null}, \"mark\": {\"tooltip\": null}, \"title\": {\"anchor\": \"middle\"}}, \"vconcat\": [{\"mark\": {\"type\": \"bar\", \"clip\": true, \"height\": 15}, \"encoding\": {\"color\": {\"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-10, 0, 10], \"interpolate\": \"lab\", \"range\": [\"red\", \"#bbbbbb\", \"green\"]}, \"title\": \"Match weight\", \"type\": \"quantitative\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"probability_two_random_records_match\", \"format\": \".4f\", \"title\": \"Probability two random records match\", \"type\": \"nominal\"}, {\"field\": \"log2_bayes_factor\", \"format\": \",.4f\", \"title\": \"Equivalent match weight\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"domain\": false, \"gridColor\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": \"#aaa\"}, \"value\": \"#ddd\"}, \"gridDash\": {\"condition\": {\"test\": \"abs(datum.value / 10) == 1\", \"value\": [3]}, \"value\": null}, \"gridWidth\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": 2}, \"value\": 1}, \"labels\": false, \"ticks\": false, \"title\": \"\"}, \"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-20, 20]}, \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": \"Prior (starting) match weight\", \"titleAlign\": \"right\", \"titleAngle\": 0, \"titleFontWeight\": \"normal\"}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": 20, \"transform\": [{\"filter\": \"(datum.comparison_name == 
'probability_two_random_records_match')\"}]}, {\"mark\": {\"type\": \"bar\", \"clip\": true}, \"encoding\": {\"color\": {\"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-10, 0, 10], \"interpolate\": \"lab\", \"range\": [\"red\", \"#bbbbbb\", \"green\"]}, \"title\": \"Match weight\", \"type\": \"quantitative\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labelAlign\": \"left\", \"labelAnchor\": \"middle\", \"labelAngle\": 0}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"gridColor\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": \"#aaa\"}, \"value\": \"#ddd\"}, \"gridDash\": {\"condition\": {\"test\": \"abs(datum.value / 10) == 1\", \"value\": [3]}, \"value\": null}, \"gridWidth\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": 2}, \"value\": 1}, 
\"title\": \"Comparison level match weight = log2(m/u)\"}, \"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-20, 20]}, \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"axis\": {\"y\": \"independent\"}, \"scale\": {\"y\": \"independent\"}}, \"transform\": [{\"filter\": \"(datum.comparison_name != 'probability_two_random_records_match')\"}]}], \"data\": {\"name\": \"data-fc45ce83a28220af2a936ce680a9dadd\"}, \"params\": [{\"name\": \"mouse_zoom\", \"select\": {\"type\": \"interval\", \"encodings\": [\"x\"]}, \"bind\": \"scales\", \"views\": []}], \"resolve\": {\"axis\": {\"y\": \"independent\"}, \"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Model parameters (components of final match weight)\", \"subtitle\": \"Use mousewheel to zoom\"}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-fc45ce83a28220af2a936ce680a9dadd\": [{\"comparison_name\": \"probability_two_random_records_match\", \"sql_condition\": null, \"label_for_charts\": \"\", \"m_probability\": null, \"u_probability\": null, \"m_probability_description\": null, \"u_probability_description\": null, \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": null, \"is_null_level\": false, \"bayes_factor\": 0.0006890076817709412, \"log2_bayes_factor\": -10.503192311872333, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 0, \"bayes_factor_description\": \"The probability that two random records drawn at random match is 0.001 or one in  1,452.4 records.This is equivalent to a starting match weight of -10.503.\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": -1}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"\\\"company_name_l\\\" = 
\\\"company_name_r\\\"\", \"label_for_charts\": \"Exact match on company_name\", \"m_probability\": 0.3720141993657098, \"u_probability\": 5.346727241521828e-07, \"m_probability_description\": \"Amongst matching record comparisons, 37.2% of records (i.e. one in 2.688) are in the exact match on company_name comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 5.347e-05% of records (i.e. one in 1,870,303) are in the exact match on company_name comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"company_name\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 695779.2731162851, \"log2_bayes_factor\": 19.40827017693912, \"comparison_vector_value\": 4, \"max_comparison_vector_value\": 4, \"bayes_factor_description\": \"If comparison level is `exact match on company_name` then comparison is 695,779 times more likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"jaro_winkler_similarity(\\\"company_name_l\\\", \\\"company_name_r\\\") >= 0.92\", \"label_for_charts\": \"Jaro-Winkler distance of company_name >= 0.92\", \"m_probability\": 0.19070155304725356, \"u_probability\": 5.4030085809062685e-06, \"m_probability_description\": \"Amongst matching record comparisons, 19.07% of records (i.e. one in 5.244) are in the jaro-winkler distance of company_name >= 0.92 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.0005403% of records (i.e. 
one in 185,082) are in the jaro-winkler distance of company_name >= 0.92 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 35295.437753176106, \"log2_bayes_factor\": 15.107194094030717, \"comparison_vector_value\": 3, \"max_comparison_vector_value\": 4, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of company_name >= 0.92` then comparison is 35,295 times more likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"jaro_winkler_similarity(\\\"company_name_l\\\", \\\"company_name_r\\\") >= 0.88\", \"label_for_charts\": \"Jaro-Winkler distance of company_name >= 0.88\", \"m_probability\": 0.08481404998298808, \"u_probability\": 3.987532895387595e-05, \"m_probability_description\": \"Amongst matching record comparisons, 8.481% of records (i.e. one in 11.79) are in the jaro-winkler distance of company_name >= 0.88 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.003988% of records (i.e. 
one in 25,078) are in the jaro-winkler distance of company_name >= 0.88 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 2126.9805718993075, \"log2_bayes_factor\": 11.054591140267897, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 4, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of company_name >= 0.88` then comparison is 2,127 times more likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"jaro_winkler_similarity(\\\"company_name_l\\\", \\\"company_name_r\\\") >= 0.7\", \"label_for_charts\": \"Jaro-Winkler distance of company_name >= 0.7\", \"m_probability\": 0.352353073149661, \"u_probability\": 0.028111065707703935, \"m_probability_description\": \"Amongst matching record comparisons, 35.24% of records (i.e. one in 2.838) are in the jaro-winkler distance of company_name >= 0.7 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 2.811% of records (i.e. 
one in 35.57) are in the jaro-winkler distance of company_name >= 0.7 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 12.534319289542177, \"log2_bayes_factor\": 3.6478117436904105, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 4, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of company_name >= 0.7` then comparison is 12.53 times more likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.00011712445438753835, \"u_probability\": 0.9718431212820371, \"m_probability_description\": \"Amongst matching record comparisons, 0.01171% of records (i.e. one in 8,538) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 97.18% of records (i.e. 
one in 1.029) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.0001205178611883675, \"log2_bayes_factor\": -13.01846540456049, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 4, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 8,298 times less likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 0}, {\"comparison_name\": \"loc_of_incorporation\", \"sql_condition\": \"\\\"loc_of_incorporation_l\\\" = \\\"loc_of_incorporation_r\\\"\", \"label_for_charts\": \"Exact match on loc_of_incorporation\", \"m_probability\": 0.47771795453863003, \"u_probability\": 0.20575506730342918, \"m_probability_description\": \"Amongst matching record comparisons, 47.77% of records (i.e. one in 2.093) are in the exact match on loc_of_incorporation comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 20.58% of records (i.e. 
one in 4.86) are in the exact match on loc_of_incorporation comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"loc_of_incorporation\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 2.3217797782551535, \"log2_bayes_factor\": 1.2152311384884475, \"comparison_vector_value\": 3, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `exact match on loc_of_incorporation` then comparison is 2.322 times more likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 1}, {\"comparison_name\": \"loc_of_incorporation\", \"sql_condition\": \"jaro_winkler_similarity(\\\"loc_of_incorporation_l\\\", \\\"loc_of_incorporation_r\\\") >= 0.9\", \"label_for_charts\": \"Jaro-Winkler distance of loc_of_incorporation >= 0.9\", \"m_probability\": 0.027967056159476814, \"u_probability\": 0.006199367500196429, \"m_probability_description\": \"Amongst matching record comparisons, 2.797% of records (i.e. one in 35.76) are in the jaro-winkler distance of loc_of_incorporation >= 0.9 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.6199% of records (i.e. 
one in 161) are in the jaro-winkler distance of loc_of_incorporation >= 0.9 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 4.511275732337318, \"log2_bayes_factor\": 2.1735354672850598, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of loc_of_incorporation >= 0.9` then comparison is 4.511 times more likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 1}, {\"comparison_name\": \"loc_of_incorporation\", \"sql_condition\": \"jaro_winkler_similarity(\\\"loc_of_incorporation_l\\\", \\\"loc_of_incorporation_r\\\") >= 0.7\", \"label_for_charts\": \"Jaro-Winkler distance of loc_of_incorporation >= 0.7\", \"m_probability\": 0.014133426902826658, \"u_probability\": 0.0063969929142549085, \"m_probability_description\": \"Amongst matching record comparisons, 1.413% of records (i.e. one in 70.75) are in the jaro-winkler distance of loc_of_incorporation >= 0.7 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.6397% of records (i.e. 
one in 156) are in the jaro-winkler distance of loc_of_incorporation >= 0.7 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 2.209386049393937, \"log2_bayes_factor\": 1.1436455250240343, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of loc_of_incorporation >= 0.7` then comparison is 2.209 times more likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 1}, {\"comparison_name\": \"loc_of_incorporation\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.4801815623990666, \"u_probability\": 0.7816485722821195, \"m_probability_description\": \"Amongst matching record comparisons, 48.02% of records (i.e. one in 2.083) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 78.16% of records (i.e. one in 1.279) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.614318991202296, \"log2_bayes_factor\": -0.7029401110811061, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.628 times less likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 1}]}}, {\"mode\": \"vega-lite\"});\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 300, \"continuousHeight\": 300, \"discreteHeight\": 60, \"discreteWidth\": 400}, \"header\": {\"title\": null}, \"mark\": {\"tooltip\": null}, \"title\": {\"anchor\": \"middle\"}}, \"vconcat\": [{\"mark\": {\"type\": \"bar\", \"clip\": true, \"height\": 15}, \"encoding\": {\"color\": {\"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-10, 0, 10], \"interpolate\": \"lab\", \"range\": [\"red\", \"#bbbbbb\", \"green\"]}, \"title\": \"Match weight\", \"type\": \"quantitative\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"probability_two_random_records_match\", \"format\": \".4f\", \"title\": \"Probability two random records match\", \"type\": \"nominal\"}, {\"field\": \"log2_bayes_factor\", \"format\": \",.4f\", \"title\": \"Equivalent match weight\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"domain\": false, \"gridColor\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": \"#aaa\"}, \"value\": \"#ddd\"}, \"gridDash\": {\"condition\": {\"test\": \"abs(datum.value / 10) == 1\", \"value\": [3]}, \"value\": null}, \"gridWidth\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": 2}, \"value\": 1}, \"labels\": false, \"ticks\": false, \"title\": \"\"}, \"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-22, 22]}, \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": \"Prior (starting) match weight\", \"titleAlign\": \"right\", \"titleAngle\": 0, \"titleFontWeight\": \"normal\"}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": 20, \"transform\": [{\"filter\": \"(datum.comparison_name == 
'probability_two_random_records_match')\"}]}, {\"mark\": {\"type\": \"bar\", \"clip\": true}, \"encoding\": {\"color\": {\"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-10, 0, 10], \"interpolate\": \"lab\", \"range\": [\"red\", \"#bbbbbb\", \"green\"]}, \"title\": \"Match weight\", \"type\": \"quantitative\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labelAlign\": \"left\", \"labelAnchor\": \"middle\", \"labelAngle\": 0}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"gridColor\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": \"#aaa\"}, \"value\": \"#ddd\"}, \"gridDash\": {\"condition\": {\"test\": \"abs(datum.value / 10) == 1\", \"value\": [3]}, \"value\": null}, \"gridWidth\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": 2}, \"value\": 1}, 
\"title\": \"Comparison level match weight = log2(m/u)\"}, \"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-22, 22]}, \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"axis\": {\"y\": \"independent\"}, \"scale\": {\"y\": \"independent\"}}, \"transform\": [{\"filter\": \"(datum.comparison_name != 'probability_two_random_records_match')\"}]}], \"data\": {\"name\": \"data-7565426c0d5a635df187bdc25dda8c3f\"}, \"params\": [{\"name\": \"mouse_zoom\", \"select\": {\"type\": \"interval\", \"encodings\": [\"x\"]}, \"bind\": \"scales\", \"views\": []}], \"resolve\": {\"axis\": {\"y\": \"independent\"}, \"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Model parameters (components of final match weight)\", \"subtitle\": \"Use mousewheel to zoom\"}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-7565426c0d5a635df187bdc25dda8c3f\": [{\"comparison_name\": \"probability_two_random_records_match\", \"sql_condition\": null, \"label_for_charts\": \"\", \"m_probability\": null, \"u_probability\": null, \"m_probability_description\": null, \"u_probability_description\": null, \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": null, \"is_null_level\": false, \"bayes_factor\": 8.207898616494689e-05, \"log2_bayes_factor\": -13.572627563470363, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 0, \"bayes_factor_description\": \"The probability that two random records drawn at random match is 0.000 or one in  12,184.4 records.This is equivalent to a starting match weight of -13.573.\", \"probability_two_random_records_match\": 8.207224975789448e-05, \"comparison_sort_order\": -1}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"\\\"company_name_l\\\" = 
\\\"company_name_r\\\"\", \"label_for_charts\": \"Exact match on company_name\", \"m_probability\": 0.7442950075123129, \"u_probability\": 2.986425204234154e-07, \"m_probability_description\": \"Amongst matching record comparisons, 74.43% of records (i.e. one in 1.344) are in the exact match on company_name comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 2.986e-05% of records (i.e. one in 3,348,485) are in the exact match on company_name comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 2492260.6682298672, \"log2_bayes_factor\": 21.249023538607176, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on company_name` then comparison is 2,492,261 times more likely to be a match\", \"probability_two_random_records_match\": 8.207224975789448e-05, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.255704992487687, \"u_probability\": 0.9999997013574796, \"m_probability_description\": \"Amongst matching record comparisons, 25.57% of records (i.e. one in 3.911) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 100% of records (i.e. 
one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.25570506885209326, \"log2_bayes_factor\": -1.9674473354212287, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 3.911 times less likely to be a match\", \"probability_two_random_records_match\": 8.207224975789448e-05, \"comparison_sort_order\": 0}, {\"comparison_name\": \"loc_of_incorporation\", \"sql_condition\": \"\\\"loc_of_incorporation_l\\\" = \\\"loc_of_incorporation_r\\\"\", \"label_for_charts\": \"Exact match on loc_of_incorporation\", \"m_probability\": 0.5436115300955104, \"u_probability\": 0.21799689508404613, \"m_probability_description\": \"Amongst matching record comparisons, 54.36% of records (i.e. one in 1.84) are in the exact match on loc_of_incorporation comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 21.8% of records (i.e. 
one in 4.587) are in the exact match on loc_of_incorporation comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"loc_of_incorporation\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 2.4936663886241472, \"log2_bayes_factor\": 1.318268469309102, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on loc_of_incorporation` then comparison is 2.494 times more likely to be a match\", \"probability_two_random_records_match\": 8.207224975789448e-05, \"comparison_sort_order\": 1}, {\"comparison_name\": \"loc_of_incorporation\", \"sql_condition\": \"jaro_winkler_similarity(\\\"loc_of_incorporation_l\\\", \\\"loc_of_incorporation_r\\\") >= 0.9\", \"label_for_charts\": \"Jaro-Winkler distance of loc_of_incorporation >= 0.9\", \"m_probability\": 0.02862700324454422, \"u_probability\": 0.005758585752972691, \"m_probability_description\": \"Amongst matching record comparisons, 2.863% of records (i.e. one in 34.93) are in the jaro-winkler distance of loc_of_incorporation >= 0.9 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.5759% of records (i.e. 
one in 174) are in the jaro-winkler distance of loc_of_incorporation >= 0.9 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 4.971186411484177, \"log2_bayes_factor\": 2.3135902030353814, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of loc_of_incorporation >= 0.9` then comparison is 4.971 times more likely to be a match\", \"probability_two_random_records_match\": 8.207224975789448e-05, \"comparison_sort_order\": 1}, {\"comparison_name\": \"loc_of_incorporation\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.4277614666599453, \"u_probability\": 0.7762445191629812, \"m_probability_description\": \"Amongst matching record comparisons, 42.78% of records (i.e. one in 2.338) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 77.62% of records (i.e. one in 1.288) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.5510653616223885, \"log2_bayes_factor\": -0.8597046485119697, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.815 times less likely to be a match\", \"probability_two_random_records_match\": 8.207224975789448e-05, \"comparison_sort_order\": 1}]}}, {\"mode\": \"vega-lite\"});\n",
        "</script>"
       ],
       "text/plain": [
        "alt.VConcatChart(...)"
       ]
      },
-     "execution_count": 115,
+     "execution_count": 127,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2290,7 +2231,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 116,
+   "execution_count": 128,
    "id": "673a4776-1de1-46ce-a411-f7fd1668d54f",
    "metadata": {
     "tags": []
@@ -2301,23 +2242,23 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-7861bebb26e3480992f35df62a1446ee.vega-embed {\n",
+       "  #altair-viz-210545576e8d414fb910138cda2663b7.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-7861bebb26e3480992f35df62a1446ee.vega-embed details,\n",
-       "  #altair-viz-7861bebb26e3480992f35df62a1446ee.vega-embed details summary {\n",
+       "  #altair-viz-210545576e8d414fb910138cda2663b7.vega-embed details,\n",
+       "  #altair-viz-210545576e8d414fb910138cda2663b7.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-7861bebb26e3480992f35df62a1446ee\"></div>\n",
+       "<div id=\"altair-viz-210545576e8d414fb910138cda2663b7\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-7861bebb26e3480992f35df62a1446ee\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-7861bebb26e3480992f35df62a1446ee\");\n",
+       "    if (outputDiv.id !== \"altair-viz-210545576e8d414fb910138cda2663b7\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-210545576e8d414fb910138cda2663b7\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -2363,14 +2304,14 @@
        "        .catch(showError)\n",
        "        .then(() => displayChart(vegaEmbed));\n",
        "    }\n",
-       "  })({\"config\": {\"view\": {\"continuousWidth\": 300, \"continuousHeight\": 300, \"discreteHeight\": 300, \"discreteWidth\": 400}, \"header\": {\"title\": null}, \"title\": {\"anchor\": \"middle\", \"offset\": 10}}, \"hconcat\": [{\"mark\": \"bar\", \"encoding\": {\"color\": {\"value\": \"green\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labelAlign\": \"left\", \"labelAnchor\": \"middle\", \"labelAngle\": 0}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"title\": \"Proportion of record comparisons\"}, \"field\": \"m_probability\", \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Amongst 
matching record comparisons:\", \"fontSize\": 12, \"fontWeight\": \"bold\"}, \"transform\": [{\"filter\": \"(datum.bayes_factor != 'no-op filter due to vega lite issue 4680')\"}], \"width\": 150}, {\"mark\": \"bar\", \"encoding\": {\"color\": {\"value\": \"red\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labels\": false}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"title\": \"Proportion of record comparisons\"}, \"field\": \"u_probability\", \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Amongst non-matching record comparisons:\", \"fontSize\": 12, \"fontWeight\": \"bold\"}, \"transform\": 
[{\"filter\": \"(datum.bayes_factor != 'no-op filter2 due to vega lite issue 4680')\"}], \"width\": 150}], \"data\": {\"name\": \"data-b8247f1f2757a60a3093f064d0fd8cf0\"}, \"title\": {\"text\": \"Proportion of record comparisons in each comparison level by match status\", \"subtitle\": \"(m and u probabilities)\"}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-b8247f1f2757a60a3093f064d0fd8cf0\": [{\"comparison_name\": \"company_name\", \"sql_condition\": \"\\\"company_name_l\\\" = \\\"company_name_r\\\"\", \"label_for_charts\": \"Exact match on company_name\", \"m_probability\": 0.3720141993657098, \"u_probability\": 5.346727241521828e-07, \"m_probability_description\": \"Amongst matching record comparisons, 37.2% of records (i.e. one in 2.688) are in the exact match on company_name comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 5.347e-05% of records (i.e. one in 1,870,303) are in the exact match on company_name comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"company_name\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 695779.2731162851, \"log2_bayes_factor\": 19.40827017693912, \"comparison_vector_value\": 4, \"max_comparison_vector_value\": 4, \"bayes_factor_description\": \"If comparison level is `exact match on company_name` then comparison is 695,779 times more likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"jaro_winkler_similarity(\\\"company_name_l\\\", \\\"company_name_r\\\") >= 0.92\", \"label_for_charts\": \"Jaro-Winkler distance of company_name >= 0.92\", \"m_probability\": 0.19070155304725356, \"u_probability\": 5.4030085809062685e-06, \"m_probability_description\": \"Amongst matching record comparisons, 19.07% of records (i.e. 
one in 5.244) are in the jaro-winkler distance of company_name >= 0.92 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.0005403% of records (i.e. one in 185,082) are in the jaro-winkler distance of company_name >= 0.92 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 35295.437753176106, \"log2_bayes_factor\": 15.107194094030717, \"comparison_vector_value\": 3, \"max_comparison_vector_value\": 4, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of company_name >= 0.92` then comparison is 35,295 times more likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"jaro_winkler_similarity(\\\"company_name_l\\\", \\\"company_name_r\\\") >= 0.88\", \"label_for_charts\": \"Jaro-Winkler distance of company_name >= 0.88\", \"m_probability\": 0.08481404998298808, \"u_probability\": 3.987532895387595e-05, \"m_probability_description\": \"Amongst matching record comparisons, 8.481% of records (i.e. one in 11.79) are in the jaro-winkler distance of company_name >= 0.88 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.003988% of records (i.e. 
one in 25,078) are in the jaro-winkler distance of company_name >= 0.88 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 2126.9805718993075, \"log2_bayes_factor\": 11.054591140267897, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 4, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of company_name >= 0.88` then comparison is 2,127 times more likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"jaro_winkler_similarity(\\\"company_name_l\\\", \\\"company_name_r\\\") >= 0.7\", \"label_for_charts\": \"Jaro-Winkler distance of company_name >= 0.7\", \"m_probability\": 0.352353073149661, \"u_probability\": 0.028111065707703935, \"m_probability_description\": \"Amongst matching record comparisons, 35.24% of records (i.e. one in 2.838) are in the jaro-winkler distance of company_name >= 0.7 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 2.811% of records (i.e. 
one in 35.57) are in the jaro-winkler distance of company_name >= 0.7 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 12.534319289542177, \"log2_bayes_factor\": 3.6478117436904105, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 4, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of company_name >= 0.7` then comparison is 12.53 times more likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.00011712445438753835, \"u_probability\": 0.9718431212820371, \"m_probability_description\": \"Amongst matching record comparisons, 0.01171% of records (i.e. one in 8,538) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 97.18% of records (i.e. 
one in 1.029) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.0001205178611883675, \"log2_bayes_factor\": -13.01846540456049, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 4, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 8,298 times less likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 0}, {\"comparison_name\": \"loc_of_incorporation\", \"sql_condition\": \"\\\"loc_of_incorporation_l\\\" = \\\"loc_of_incorporation_r\\\"\", \"label_for_charts\": \"Exact match on loc_of_incorporation\", \"m_probability\": 0.47771795453863003, \"u_probability\": 0.20575506730342918, \"m_probability_description\": \"Amongst matching record comparisons, 47.77% of records (i.e. one in 2.093) are in the exact match on loc_of_incorporation comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 20.58% of records (i.e. 
one in 4.86) are in the exact match on loc_of_incorporation comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"loc_of_incorporation\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 2.3217797782551535, \"log2_bayes_factor\": 1.2152311384884475, \"comparison_vector_value\": 3, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `exact match on loc_of_incorporation` then comparison is 2.322 times more likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 1}, {\"comparison_name\": \"loc_of_incorporation\", \"sql_condition\": \"jaro_winkler_similarity(\\\"loc_of_incorporation_l\\\", \\\"loc_of_incorporation_r\\\") >= 0.9\", \"label_for_charts\": \"Jaro-Winkler distance of loc_of_incorporation >= 0.9\", \"m_probability\": 0.027967056159476814, \"u_probability\": 0.006199367500196429, \"m_probability_description\": \"Amongst matching record comparisons, 2.797% of records (i.e. one in 35.76) are in the jaro-winkler distance of loc_of_incorporation >= 0.9 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.6199% of records (i.e. 
one in 161) are in the jaro-winkler distance of loc_of_incorporation >= 0.9 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 4.511275732337318, \"log2_bayes_factor\": 2.1735354672850598, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of loc_of_incorporation >= 0.9` then comparison is 4.511 times more likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 1}, {\"comparison_name\": \"loc_of_incorporation\", \"sql_condition\": \"jaro_winkler_similarity(\\\"loc_of_incorporation_l\\\", \\\"loc_of_incorporation_r\\\") >= 0.7\", \"label_for_charts\": \"Jaro-Winkler distance of loc_of_incorporation >= 0.7\", \"m_probability\": 0.014133426902826658, \"u_probability\": 0.0063969929142549085, \"m_probability_description\": \"Amongst matching record comparisons, 1.413% of records (i.e. one in 70.75) are in the jaro-winkler distance of loc_of_incorporation >= 0.7 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.6397% of records (i.e. 
one in 156) are in the jaro-winkler distance of loc_of_incorporation >= 0.7 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 2.209386049393937, \"log2_bayes_factor\": 1.1436455250240343, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of loc_of_incorporation >= 0.7` then comparison is 2.209 times more likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 1}, {\"comparison_name\": \"loc_of_incorporation\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.4801815623990666, \"u_probability\": 0.7816485722821195, \"m_probability_description\": \"Amongst matching record comparisons, 48.02% of records (i.e. one in 2.083) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 78.16% of records (i.e. one in 1.279) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.614318991202296, \"log2_bayes_factor\": -0.7029401110811061, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.628 times less likely to be a match\", \"probability_two_random_records_match\": 0.0006885332770538962, \"comparison_sort_order\": 1}]}}, {\"mode\": \"vega-lite\"});\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 300, \"continuousHeight\": 300, \"discreteHeight\": 300, \"discreteWidth\": 400}, \"header\": {\"title\": null}, \"title\": {\"anchor\": \"middle\", \"offset\": 10}}, \"hconcat\": [{\"mark\": \"bar\", \"encoding\": {\"color\": {\"value\": \"green\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labelAlign\": \"left\", \"labelAnchor\": \"middle\", \"labelAngle\": 0}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"title\": \"Proportion of record comparisons\"}, \"field\": \"m_probability\", \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Amongst 
matching record comparisons:\", \"fontSize\": 12, \"fontWeight\": \"bold\"}, \"transform\": [{\"filter\": \"(datum.bayes_factor != 'no-op filter due to vega lite issue 4680')\"}], \"width\": 150}, {\"mark\": \"bar\", \"encoding\": {\"color\": {\"value\": \"red\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labels\": false}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"title\": \"Proportion of record comparisons\"}, \"field\": \"u_probability\", \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Amongst non-matching record comparisons:\", \"fontSize\": 12, \"fontWeight\": \"bold\"}, \"transform\": 
[{\"filter\": \"(datum.bayes_factor != 'no-op filter2 due to vega lite issue 4680')\"}], \"width\": 150}], \"data\": {\"name\": \"data-acf8c47146c2570c2985dfacce8871f3\"}, \"title\": {\"text\": \"Proportion of record comparisons in each comparison level by match status\", \"subtitle\": \"(m and u probabilities)\"}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-acf8c47146c2570c2985dfacce8871f3\": [{\"comparison_name\": \"company_name\", \"sql_condition\": \"\\\"company_name_l\\\" = \\\"company_name_r\\\"\", \"label_for_charts\": \"Exact match on company_name\", \"m_probability\": 0.7442950075123129, \"u_probability\": 2.986425204234154e-07, \"m_probability_description\": \"Amongst matching record comparisons, 74.43% of records (i.e. one in 1.344) are in the exact match on company_name comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 2.986e-05% of records (i.e. one in 3,348,485) are in the exact match on company_name comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 2492260.6682298672, \"log2_bayes_factor\": 21.249023538607176, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on company_name` then comparison is 2,492,261 times more likely to be a match\", \"probability_two_random_records_match\": 8.207224975789448e-05, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.255704992487687, \"u_probability\": 0.9999997013574796, \"m_probability_description\": \"Amongst matching record comparisons, 25.57% of records (i.e. 
one in 3.911) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 100% of records (i.e. one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.25570506885209326, \"log2_bayes_factor\": -1.9674473354212287, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 3.911 times less likely to be a match\", \"probability_two_random_records_match\": 8.207224975789448e-05, \"comparison_sort_order\": 0}, {\"comparison_name\": \"loc_of_incorporation\", \"sql_condition\": \"\\\"loc_of_incorporation_l\\\" = \\\"loc_of_incorporation_r\\\"\", \"label_for_charts\": \"Exact match on loc_of_incorporation\", \"m_probability\": 0.5436115300955104, \"u_probability\": 0.21799689508404613, \"m_probability_description\": \"Amongst matching record comparisons, 54.36% of records (i.e. one in 1.84) are in the exact match on loc_of_incorporation comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 21.8% of records (i.e. 
one in 4.587) are in the exact match on loc_of_incorporation comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"loc_of_incorporation\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 2.4936663886241472, \"log2_bayes_factor\": 1.318268469309102, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on loc_of_incorporation` then comparison is 2.494 times more likely to be a match\", \"probability_two_random_records_match\": 8.207224975789448e-05, \"comparison_sort_order\": 1}, {\"comparison_name\": \"loc_of_incorporation\", \"sql_condition\": \"jaro_winkler_similarity(\\\"loc_of_incorporation_l\\\", \\\"loc_of_incorporation_r\\\") >= 0.9\", \"label_for_charts\": \"Jaro-Winkler distance of loc_of_incorporation >= 0.9\", \"m_probability\": 0.02862700324454422, \"u_probability\": 0.005758585752972691, \"m_probability_description\": \"Amongst matching record comparisons, 2.863% of records (i.e. one in 34.93) are in the jaro-winkler distance of loc_of_incorporation >= 0.9 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.5759% of records (i.e. 
one in 174) are in the jaro-winkler distance of loc_of_incorporation >= 0.9 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 4.971186411484177, \"log2_bayes_factor\": 2.3135902030353814, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of loc_of_incorporation >= 0.9` then comparison is 4.971 times more likely to be a match\", \"probability_two_random_records_match\": 8.207224975789448e-05, \"comparison_sort_order\": 1}, {\"comparison_name\": \"loc_of_incorporation\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.4277614666599453, \"u_probability\": 0.7762445191629812, \"m_probability_description\": \"Amongst matching record comparisons, 42.78% of records (i.e. one in 2.338) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 77.62% of records (i.e. one in 1.288) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.5510653616223885, \"log2_bayes_factor\": -0.8597046485119697, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.815 times less likely to be a match\", \"probability_two_random_records_match\": 8.207224975789448e-05, \"comparison_sort_order\": 1}]}}, {\"mode\": \"vega-lite\"});\n",
        "</script>"
       ],
       "text/plain": [
        "alt.HConcatChart(...)"
       ]
      },
-     "execution_count": 116,
+     "execution_count": 128,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2381,7 +2322,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 107,
    "id": "ebf9e326-38f1-4d78-b302-15867cda1009",
    "metadata": {},
    "outputs": [],
@@ -2409,7 +2350,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 120,
+   "execution_count": 129,
    "id": "72ff6575-68e3-4256-8253-85eb2564501f",
    "metadata": {
     "tags": []
@@ -2419,28 +2360,8 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Blocking time: 0.37 seconds\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "d550d84b328c4d3082bd7cf5d03b803b",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Predict time: 78.84 seconds\n"
+      "Blocking time: 0.20 seconds\n",
+      "Predict time: 0.12 seconds\n"
      ]
     }
    ],
@@ -2450,7 +2371,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 121,
+   "execution_count": 130,
    "id": "24e14675-11cf-4c46-a592-7733326113d2",
    "metadata": {
     "tags": []
@@ -2462,7 +2383,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 123,
+   "execution_count": 131,
+   "id": "d50332a5-a8dc-444b-be92-b9d29f73763e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "preds_df = preds_df.merge(sec_df[[\"record_id\", \"company_name_raw\"]], how=\"left\", left_on=\"record_id_l\", right_on=\"record_id\").rename(columns={\"company_name_raw\": \"company_name_sec\"})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 132,
+   "id": "fddbed17-3d71-4c85-95d5-c3d0fd517f9d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "preds_df = preds_df.merge(ex21_df[[\"record_id\", \"company_name_raw\"]], how=\"left\", left_on=\"record_id_r\", right_on=\"record_id\").rename(columns={\"company_name_raw\": \"company_name_ex21\"})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 133,
    "id": "3d733c2a-7004-4ce8-8d3f-25ed1e720c36",
    "metadata": {
     "tags": []
@@ -2498,10 +2439,7 @@
        "      <th>company_name_l</th>\n",
        "      <th>company_name_r</th>\n",
        "      <th>gamma_company_name</th>\n",
-       "      <th>tf_company_name_l</th>\n",
-       "      <th>tf_company_name_r</th>\n",
        "      <th>bf_company_name</th>\n",
-       "      <th>bf_tf_adj_company_name</th>\n",
        "      <th>loc_of_incorporation_l</th>\n",
        "      <th>loc_of_incorporation_r</th>\n",
        "      <th>gamma_loc_of_incorporation</th>\n",
@@ -2509,376 +2447,2601 @@
        "      <th>tf_loc_of_incorporation_r</th>\n",
        "      <th>bf_loc_of_incorporation</th>\n",
        "      <th>bf_tf_adj_loc_of_incorporation</th>\n",
-       "      <th>report_year_l</th>\n",
-       "      <th>report_year_r</th>\n",
        "      <th>company_name_mphone_l</th>\n",
        "      <th>company_name_mphone_r</th>\n",
+       "      <th>record_id_x</th>\n",
+       "      <th>company_name_sec</th>\n",
+       "      <th>record_id_y</th>\n",
+       "      <th>company_name_ex21</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>8180</td>\n",
+       "      <td>159390</td>\n",
+       "      <td>national instruments corporation</td>\n",
+       "      <td>national instruments corporation</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>republic of korea</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.000234</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>NXNL INSTRMNTS</td>\n",
+       "      <td>NXNL INSTRMNTS</td>\n",
+       "      <td>8180</td>\n",
+       "      <td>national instruments corp</td>\n",
+       "      <td>159390</td>\n",
+       "      <td>national instruments (korea) corporation</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>176</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>6034</td>\n",
+       "      <td>107265</td>\n",
+       "      <td>afternext healthtech acquisition corporation</td>\n",
+       "      <td>afternext healthtech acquisition corporation</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>e9</td>\n",
+       "      <td>cayman islands</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.001069</td>\n",
+       "      <td>0.015387</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>AFTRNKST HL0TX AKKSXN</td>\n",
+       "      <td>AFTRNKST HL0TX AKKSXN</td>\n",
+       "      <td>6034</td>\n",
+       "      <td>afternext healthtech acquisition corp.</td>\n",
+       "      <td>107265</td>\n",
+       "      <td>afternext healthtech acquisition corp</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>178</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>6799</td>\n",
+       "      <td>117610</td>\n",
+       "      <td>gap incorporated</td>\n",
+       "      <td>gap incorporated</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>puerto rico</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.001548</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>KP</td>\n",
+       "      <td>KP</td>\n",
+       "      <td>6799</td>\n",
+       "      <td>gap inc</td>\n",
+       "      <td>117610</td>\n",
+       "      <td>gap (puerto rico), inc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>183</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>5811</td>\n",
+       "      <td>170135</td>\n",
+       "      <td>rockley photonics holdings limited</td>\n",
+       "      <td>rockley photonics holdings limited</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>e9</td>\n",
+       "      <td>cayman islands</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.001069</td>\n",
+       "      <td>0.015387</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>RKL FTNKS HLTNKS</td>\n",
+       "      <td>RKL FTNKS HLTNKS</td>\n",
+       "      <td>5811</td>\n",
+       "      <td>rockley photonics holdings ltd</td>\n",
+       "      <td>170135</td>\n",
+       "      <td>rockley photonics holdings limited</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>184</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>6799</td>\n",
+       "      <td>117608</td>\n",
+       "      <td>gap incorporated</td>\n",
+       "      <td>gap incorporated</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>california</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.015978</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>KP</td>\n",
+       "      <td>KP</td>\n",
+       "      <td>6799</td>\n",
+       "      <td>gap inc</td>\n",
+       "      <td>117608</td>\n",
+       "      <td>gap (itm) inc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>186</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>6799</td>\n",
+       "      <td>117605</td>\n",
+       "      <td>gap incorporated</td>\n",
+       "      <td>gap incorporated</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>canada</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.012191</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>KP</td>\n",
+       "      <td>KP</td>\n",
+       "      <td>6799</td>\n",
+       "      <td>gap inc</td>\n",
+       "      <td>117605</td>\n",
+       "      <td>gap (canada) inc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>412</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>1524</td>\n",
+       "      <td>165843</td>\n",
+       "      <td>aircastle limited</td>\n",
+       "      <td>aircastle limited</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>d0</td>\n",
+       "      <td>ireland</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000150</td>\n",
+       "      <td>0.008315</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>ARKSTL</td>\n",
+       "      <td>ARKSTL</td>\n",
+       "      <td>1524</td>\n",
+       "      <td>aircastle ltd</td>\n",
+       "      <td>165843</td>\n",
+       "      <td>aircastle (ireland) limited</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>189</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>6753</td>\n",
+       "      <td>115383</td>\n",
+       "      <td>arthur j gallagher and company</td>\n",
+       "      <td>arthur j gallagher and company</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>illinois</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.006115</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>AR0R J KLKHR ANT</td>\n",
+       "      <td>AR0R J KLKHR ANT</td>\n",
+       "      <td>6753</td>\n",
+       "      <td>arthur j. gallagher &amp; co.</td>\n",
+       "      <td>115383</td>\n",
+       "      <td>arthur j. gallagher &amp; co</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>193</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>6651</td>\n",
+       "      <td>110797</td>\n",
+       "      <td>flowserve corporation</td>\n",
+       "      <td>flowserve corporation</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>new york</td>\n",
+       "      <td>mauritius</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.009913</td>\n",
+       "      <td>0.001075</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>FLSRF</td>\n",
+       "      <td>FLSRF</td>\n",
+       "      <td>6651</td>\n",
+       "      <td>flowserve corp</td>\n",
+       "      <td>110797</td>\n",
+       "      <td>flowserve (mauritius) corporation</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>406</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>578</td>\n",
+       "      <td>24844</td>\n",
+       "      <td>united parcel service incorporated</td>\n",
+       "      <td>united parcel service incorporated</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>ohio</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.008136</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>UNTT PRSL SRFS</td>\n",
+       "      <td>UNTT PRSL SRFS</td>\n",
+       "      <td>578</td>\n",
+       "      <td>united parcel service inc</td>\n",
+       "      <td>24844</td>\n",
+       "      <td>united parcel service, inc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>198</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>5812</td>\n",
+       "      <td>171905</td>\n",
+       "      <td>nextracker incorporated</td>\n",
+       "      <td>nextracker incorporated</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>united states delaware</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.002278</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>NKSTRKR</td>\n",
+       "      <td>NKSTRKR</td>\n",
+       "      <td>5812</td>\n",
+       "      <td>nextracker inc.</td>\n",
+       "      <td>171905</td>\n",
+       "      <td>nextracker inc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>199</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>5843</td>\n",
+       "      <td>51850</td>\n",
+       "      <td>sculptor acquisition corp i</td>\n",
+       "      <td>sculptor acquisition corp i</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>e9</td>\n",
+       "      <td>cayman islands</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.001069</td>\n",
+       "      <td>0.015387</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>SKLPTR AKKSXN I</td>\n",
+       "      <td>SKLPTR AKKSXN I</td>\n",
+       "      <td>5843</td>\n",
+       "      <td>sculptor acquisition corp i</td>\n",
+       "      <td>51850</td>\n",
+       "      <td>sculptor acquisition corp i</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>174</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>7095</td>\n",
+       "      <td>179994</td>\n",
+       "      <td>cintas corporation</td>\n",
+       "      <td>cintas corporation</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>washington</td>\n",
+       "      <td>nevada</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.002996</td>\n",
+       "      <td>0.014652</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>SNTS</td>\n",
+       "      <td>SNTS</td>\n",
+       "      <td>7095</td>\n",
+       "      <td>cintas corp</td>\n",
+       "      <td>179994</td>\n",
+       "      <td>cintas corporation</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>405</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>285</td>\n",
+       "      <td>12641</td>\n",
+       "      <td>onespan incorporated</td>\n",
+       "      <td>onespan incorporated</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>usa, state of delaware</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.000011</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>ONSPN</td>\n",
+       "      <td>ONSPN</td>\n",
+       "      <td>285</td>\n",
+       "      <td>onespan inc.</td>\n",
+       "      <td>12641</td>\n",
+       "      <td>onespan inc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>207</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>6282</td>\n",
+       "      <td>97173</td>\n",
+       "      <td>mars acquisition corporation</td>\n",
+       "      <td>mars acquisition corporation</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>e9</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.001069</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>MRS AKKSXN</td>\n",
+       "      <td>MRS AKKSXN</td>\n",
+       "      <td>6282</td>\n",
+       "      <td>mars acquisition corp.</td>\n",
+       "      <td>97173</td>\n",
+       "      <td>mars acquisition corp</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>212</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>4834</td>\n",
+       "      <td>97747</td>\n",
+       "      <td>viatris incorporated</td>\n",
+       "      <td>viatris incorporated</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>philippines</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.001927</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>FTRS</td>\n",
+       "      <td>FTRS</td>\n",
+       "      <td>4834</td>\n",
+       "      <td>viatris inc</td>\n",
+       "      <td>97747</td>\n",
+       "      <td>viatris, inc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>397</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>1205</td>\n",
+       "      <td>35911</td>\n",
+       "      <td>turning point brands incorporated</td>\n",
+       "      <td>turning point brands incorporated</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>ontario, canada</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.000852</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>TRNNK PNT BRNTS</td>\n",
+       "      <td>TRNNK PNT BRNTS</td>\n",
+       "      <td>1205</td>\n",
+       "      <td>turning point brands, inc.</td>\n",
+       "      <td>35911</td>\n",
+       "      <td>turning point brands (canada) inc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>396</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>1171</td>\n",
+       "      <td>35941</td>\n",
+       "      <td>clearpoint neuro incorporated</td>\n",
+       "      <td>clearpoint neuro incorporated</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>canada new brunswick</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.000006</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>KLRPNT NR</td>\n",
+       "      <td>KLRPNT NR</td>\n",
+       "      <td>1171</td>\n",
+       "      <td>clearpoint neuro, inc.</td>\n",
+       "      <td>35941</td>\n",
+       "      <td>clearpoint neuro (canada) inc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>393</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>1765</td>\n",
+       "      <td>51537</td>\n",
+       "      <td>genpact limited</td>\n",
+       "      <td>genpact limited</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>d0</td>\n",
+       "      <td>united kingdom</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000150</td>\n",
+       "      <td>0.031521</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>JNPKT</td>\n",
+       "      <td>JNPKT</td>\n",
+       "      <td>1765</td>\n",
+       "      <td>genpact ltd</td>\n",
+       "      <td>51537</td>\n",
+       "      <td>genpact (uk) ltd</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>223</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>6181</td>\n",
+       "      <td>106386</td>\n",
+       "      <td>perimeter solutions sa</td>\n",
+       "      <td>perimeter solutions sa</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>n4</td>\n",
+       "      <td>grand of luxembourg</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000017</td>\n",
+       "      <td>0.000011</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>PRMTR SLXNS S</td>\n",
+       "      <td>PRMTR SLXNS S</td>\n",
+       "      <td>6181</td>\n",
+       "      <td>perimeter solutions, sa</td>\n",
+       "      <td>106386</td>\n",
+       "      <td>perimeter solutions sa</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>390</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>949</td>\n",
+       "      <td>34324</td>\n",
+       "      <td>ceva incorporated</td>\n",
+       "      <td>ceva incorporated</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>cayman islands</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.015387</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>SF</td>\n",
+       "      <td>SF</td>\n",
+       "      <td>949</td>\n",
+       "      <td>ceva inc</td>\n",
+       "      <td>34324</td>\n",
+       "      <td>ceva inc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>226</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>6825</td>\n",
+       "      <td>123476</td>\n",
+       "      <td>harte hanks incorporated</td>\n",
+       "      <td>harte hanks incorporated</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>ohio</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.008136</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>HRT HNKS</td>\n",
+       "      <td>HRT HNKS</td>\n",
+       "      <td>6825</td>\n",
+       "      <td>harte hanks inc</td>\n",
+       "      <td>123476</td>\n",
+       "      <td>harte hanks, inc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>228</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>234</td>\n",
+       "      <td>6600</td>\n",
+       "      <td>jones lang lasalle incorporated</td>\n",
+       "      <td>jones lang lasalle incorporated</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>maryland</td>\n",
+       "      <td>puerto rico</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.007786</td>\n",
+       "      <td>0.001548</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>JNS LNK LSL</td>\n",
+       "      <td>JNS LNK LSL</td>\n",
+       "      <td>234</td>\n",
+       "      <td>jones lang lasalle inc</td>\n",
+       "      <td>6600</td>\n",
+       "      <td>jones lang lasalle (puerto rico), inc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>229</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>234</td>\n",
+       "      <td>6596</td>\n",
+       "      <td>jones lang lasalle incorporated</td>\n",
+       "      <td>jones lang lasalle incorporated</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>maryland</td>\n",
+       "      <td>philippines</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.007786</td>\n",
+       "      <td>0.001927</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>JNS LNK LSL</td>\n",
+       "      <td>JNS LNK LSL</td>\n",
+       "      <td>234</td>\n",
+       "      <td>jones lang lasalle inc</td>\n",
+       "      <td>6596</td>\n",
+       "      <td>jones lang lasalle (philippines), inc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>231</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>2097</td>\n",
+       "      <td>54939</td>\n",
+       "      <td>optimizerx corporation</td>\n",
+       "      <td>optimizerx corporation</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>nevada</td>\n",
+       "      <td>michigan</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.014652</td>\n",
+       "      <td>0.007151</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>OPTMSRKS</td>\n",
+       "      <td>OPTMSRKS</td>\n",
+       "      <td>2097</td>\n",
+       "      <td>optimizerx corp</td>\n",
+       "      <td>54939</td>\n",
+       "      <td>optimizerx corporation</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>201</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>6176</td>\n",
+       "      <td>166072</td>\n",
+       "      <td>phoenix motor incorporated</td>\n",
+       "      <td>phoenix motor incorporated</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>us</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.000908</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>FNKS MTR</td>\n",
+       "      <td>FNKS MTR</td>\n",
+       "      <td>6176</td>\n",
+       "      <td>phoenix motor inc.</td>\n",
+       "      <td>166072</td>\n",
+       "      <td>phoenix motor inc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>232</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>2117</td>\n",
+       "      <td>57288</td>\n",
+       "      <td>transocean limited</td>\n",
+       "      <td>transocean limited</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>v8</td>\n",
+       "      <td>switzerland</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000033</td>\n",
+       "      <td>0.006421</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>TRNSSN</td>\n",
+       "      <td>TRNSSN</td>\n",
+       "      <td>2117</td>\n",
+       "      <td>transocean ltd.</td>\n",
+       "      <td>57288</td>\n",
+       "      <td>transocean ltd</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>421</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>1348</td>\n",
+       "      <td>40725</td>\n",
+       "      <td>lazard group limited liability company</td>\n",
+       "      <td>lazard group limited liability company</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>us</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.000908</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>LSRT KRP</td>\n",
+       "      <td>LSRT KRP</td>\n",
+       "      <td>1348</td>\n",
+       "      <td>lazard group llc</td>\n",
+       "      <td>40725</td>\n",
+       "      <td>lazard group llc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>169</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>6922</td>\n",
+       "      <td>189462</td>\n",
+       "      <td>analog devices incorporated</td>\n",
+       "      <td>analog devices incorporated</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>massachusetts</td>\n",
+       "      <td>united states</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.004466</td>\n",
+       "      <td>0.012146</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>ANLK TFSS</td>\n",
+       "      <td>ANLK TFSS</td>\n",
+       "      <td>6922</td>\n",
+       "      <td>analog devices inc</td>\n",
+       "      <td>189462</td>\n",
+       "      <td>analog devices, inc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>115</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>2485</td>\n",
+       "      <td>167379</td>\n",
+       "      <td>ameriguard security services incorporated</td>\n",
+       "      <td>ameriguard security services incorporated</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>nevada</td>\n",
+       "      <td>california</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.014652</td>\n",
+       "      <td>0.015978</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>AMRKRT SKRT SRFSS</td>\n",
+       "      <td>AMRKRT SKRT SRFSS</td>\n",
+       "      <td>2485</td>\n",
+       "      <td>ameriguard security services, inc.</td>\n",
+       "      <td>167379</td>\n",
+       "      <td>ameriguard security services, inc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>116</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>2486</td>\n",
+       "      <td>167379</td>\n",
+       "      <td>ameriguard security services incorporated</td>\n",
+       "      <td>ameriguard security services incorporated</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>nevada</td>\n",
+       "      <td>california</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.014652</td>\n",
+       "      <td>0.015978</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>AMRKRT SKRT SRFSS</td>\n",
+       "      <td>AMRKRT SKRT SRFSS</td>\n",
+       "      <td>2486</td>\n",
+       "      <td>ameriguard security services, inc.</td>\n",
+       "      <td>167379</td>\n",
+       "      <td>ameriguard security services, inc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>120</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>4683</td>\n",
+       "      <td>95837</td>\n",
+       "      <td>advantage solutions incorporated</td>\n",
+       "      <td>advantage solutions incorporated</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>canada</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.012191</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>ATFNTJ SLXNS</td>\n",
+       "      <td>ATFNTJ SLXNS</td>\n",
+       "      <td>4683</td>\n",
+       "      <td>advantage solutions inc.</td>\n",
+       "      <td>95837</td>\n",
+       "      <td>advantage solutions inc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>445</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>926</td>\n",
+       "      <td>165871</td>\n",
+       "      <td>commvault systems incorporated</td>\n",
+       "      <td>commvault systems incorporated</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>ontario, canada</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.000852</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>KMFLT SSTMS</td>\n",
+       "      <td>KMFLT SSTMS</td>\n",
+       "      <td>926</td>\n",
+       "      <td>commvault systems inc</td>\n",
+       "      <td>165871</td>\n",
+       "      <td>commvault systems (canada) inc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>124</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>4148</td>\n",
+       "      <td>90738</td>\n",
+       "      <td>firstsun capital bancorp</td>\n",
+       "      <td>firstsun capital bancorp</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>new mexico</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.000652</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>FRSTSN KPTL BNKRP</td>\n",
+       "      <td>FRSTSN KPTL BNKRP</td>\n",
+       "      <td>4148</td>\n",
+       "      <td>firstsun capital bancorp</td>\n",
+       "      <td>90738</td>\n",
+       "      <td>firstsun capital bancorp</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>126</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>5544</td>\n",
+       "      <td>26048</td>\n",
+       "      <td>taboola com limited</td>\n",
+       "      <td>taboola com limited</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>l3</td>\n",
+       "      <td>israel</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000061</td>\n",
+       "      <td>0.003057</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>TBL KM</td>\n",
+       "      <td>TBL KM</td>\n",
+       "      <td>5544</td>\n",
+       "      <td>taboola.com ltd.</td>\n",
+       "      <td>26048</td>\n",
+       "      <td>taboola.com ltd</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>443</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>2</td>\n",
+       "      <td>96</td>\n",
+       "      <td>henry schein incorporated</td>\n",
+       "      <td>henry schein incorporated</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>pennsylvania</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.007919</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>HNR SXN</td>\n",
+       "      <td>HNR SXN</td>\n",
+       "      <td>2</td>\n",
+       "      <td>henry schein inc</td>\n",
+       "      <td>96</td>\n",
+       "      <td>henry schein (lancaster, pa) inc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>132</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>6668</td>\n",
+       "      <td>117995</td>\n",
+       "      <td>tomi environmental solutions incorporated</td>\n",
+       "      <td>tomi environmental solutions incorporated</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>florida</td>\n",
+       "      <td>nevada</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.014691</td>\n",
+       "      <td>0.014652</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>TM ENFRNMNTL SLXNS</td>\n",
+       "      <td>TM ENFRNMNTL SLXNS</td>\n",
+       "      <td>6668</td>\n",
+       "      <td>tomi environmental solutions, inc.</td>\n",
+       "      <td>117995</td>\n",
+       "      <td>tomi environmental solutions, inc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>136</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>6148</td>\n",
+       "      <td>107455</td>\n",
+       "      <td>esab corporation</td>\n",
+       "      <td>esab corporation</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>united states</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.012146</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>ESB</td>\n",
+       "      <td>ESB</td>\n",
+       "      <td>6148</td>\n",
+       "      <td>esab corp</td>\n",
+       "      <td>107455</td>\n",
+       "      <td>esab corporation</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>137</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>6958</td>\n",
+       "      <td>104521</td>\n",
+       "      <td>apache corporation</td>\n",
+       "      <td>apache corporation</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>new jersey</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.006143</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>APX</td>\n",
+       "      <td>APX</td>\n",
+       "      <td>6958</td>\n",
+       "      <td>apache corp</td>\n",
+       "      <td>104521</td>\n",
+       "      <td>apache corporation</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>138</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>7011</td>\n",
+       "      <td>121758</td>\n",
+       "      <td>ncr corporation</td>\n",
+       "      <td>ncr corporation</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>maryland</td>\n",
+       "      <td>new zealand</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.007786</td>\n",
+       "      <td>0.002590</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>NKR</td>\n",
+       "      <td>NKR</td>\n",
+       "      <td>7011</td>\n",
+       "      <td>ncr corp</td>\n",
+       "      <td>121758</td>\n",
+       "      <td>ncr (nz) corporation</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>423</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>77</td>\n",
+       "      <td>165059</td>\n",
+       "      <td>jakks pacific incorporated</td>\n",
+       "      <td>jakks pacific incorporated</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>canada</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.012191</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>JKS PSFK</td>\n",
+       "      <td>JKS PSFK</td>\n",
+       "      <td>77</td>\n",
+       "      <td>jakks pacific inc</td>\n",
+       "      <td>165059</td>\n",
+       "      <td>jakks pacific (canada), inc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>139</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>4902</td>\n",
+       "      <td>170051</td>\n",
+       "      <td>gan limited</td>\n",
+       "      <td>gan limited</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>d0</td>\n",
+       "      <td>england and wales</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000150</td>\n",
+       "      <td>0.003536</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>KN</td>\n",
+       "      <td>KN</td>\n",
+       "      <td>4902</td>\n",
+       "      <td>gan ltd</td>\n",
+       "      <td>170051</td>\n",
+       "      <td>gan (uk) limited</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>141</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>6613</td>\n",
+       "      <td>108716</td>\n",
+       "      <td>cts corporation</td>\n",
+       "      <td>cts corporation</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>indiana</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.004060</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>KTS</td>\n",
+       "      <td>KTS</td>\n",
+       "      <td>6613</td>\n",
+       "      <td>cts corp</td>\n",
+       "      <td>108716</td>\n",
+       "      <td>cts corporation</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>437</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>738</td>\n",
+       "      <td>29776</td>\n",
+       "      <td>garmin limited</td>\n",
+       "      <td>garmin limited</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>v8</td>\n",
+       "      <td>thailand</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000033</td>\n",
+       "      <td>0.002378</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>KRMN</td>\n",
+       "      <td>KRMN</td>\n",
+       "      <td>738</td>\n",
+       "      <td>garmin ltd</td>\n",
+       "      <td>29776</td>\n",
+       "      <td>garmin (thailand) ltd</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>435</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>277</td>\n",
+       "      <td>9849</td>\n",
+       "      <td>c h robinson worldwide incorporated</td>\n",
+       "      <td>c h robinson worldwide incorporated</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>united states</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.012146</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>K H RBNSN WRLTWT</td>\n",
+       "      <td>K H RBNSN WRLTWT</td>\n",
+       "      <td>277</td>\n",
+       "      <td>c. h. robinson worldwide, inc.</td>\n",
+       "      <td>9849</td>\n",
+       "      <td>c.h. robinson worldwide, inc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>146</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>6763</td>\n",
+       "      <td>176423</td>\n",
+       "      <td>richardson electronics limited</td>\n",
+       "      <td>richardson electronics limited</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>thailand</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.002378</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>RXRTSN ELKTRNKS</td>\n",
+       "      <td>RXRTSN ELKTRNKS</td>\n",
+       "      <td>6763</td>\n",
+       "      <td>richardson electronics, ltd.</td>\n",
+       "      <td>176423</td>\n",
+       "      <td>richardson electronics (thailand) limited</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>149</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>4875</td>\n",
+       "      <td>98755</td>\n",
+       "      <td>api group corporation</td>\n",
+       "      <td>api group corporation</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>d8</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000078</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>AP KRP</td>\n",
+       "      <td>AP KRP</td>\n",
+       "      <td>4875</td>\n",
+       "      <td>api group corp</td>\n",
+       "      <td>98755</td>\n",
+       "      <td>api group corporation</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>432</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>2310</td>\n",
+       "      <td>167475</td>\n",
+       "      <td>thermon group holdings incorporated</td>\n",
+       "      <td>thermon group holdings incorporated</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware, united states</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.002139</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0RMN KRP HLTNKS</td>\n",
+       "      <td>0RMN KRP HLTNKS</td>\n",
+       "      <td>2310</td>\n",
+       "      <td>thermon group holdings, inc.</td>\n",
+       "      <td>167475</td>\n",
+       "      <td>thermon group holdings, inc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>156</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>6677</td>\n",
+       "      <td>118432</td>\n",
+       "      <td>aon public limited company</td>\n",
+       "      <td>aon public limited company</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>l2</td>\n",
+       "      <td>ireland</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000111</td>\n",
+       "      <td>0.008315</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>AN</td>\n",
+       "      <td>AN</td>\n",
+       "      <td>6677</td>\n",
+       "      <td>aon plc</td>\n",
+       "      <td>118432</td>\n",
+       "      <td>aon plc</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>158</th>\n",
+       "      <td>6.816691</td>\n",
+       "      <td>0.991207</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>5955</td>\n",
+       "      <td>80272</td>\n",
+       "      <td>minority equality opportunities acquisition in...</td>\n",
+       "      <td>minority equality opportunities acquisition in...</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.492261e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware, united states</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.002139</td>\n",
+       "      <td>0.551065</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>MNRT EKLT OPRTNTS AKKSXN</td>\n",
+       "      <td>MNRT EKLT OPRTNTS AKKSXN</td>\n",
+       "      <td>5955</td>\n",
+       "      <td>minority equality opportunities acquisition inc.</td>\n",
+       "      <td>80272</td>\n",
+       "      <td>minority equality opportunities acquisition inc</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     match_weight  match_probability         source_dataset_l         source_dataset_r  record_id_l  record_id_r                                     company_name_l                                     company_name_r  gamma_company_name  bf_company_name loc_of_incorporation_l   loc_of_incorporation_r  gamma_loc_of_incorporation  tf_loc_of_incorporation_l  tf_loc_of_incorporation_r  bf_loc_of_incorporation  bf_tf_adj_loc_of_incorporation     company_name_mphone_l     company_name_mphone_r  record_id_x                                  company_name_sec  record_id_y                                company_name_ex21\n",
+       "0        6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         8180       159390                   national instruments corporation                   national instruments corporation                   1     2.492261e+06               delaware        republic of korea                           0                   0.372842                   0.000234                 0.551065                             1.0            NXNL INSTRMNTS            NXNL INSTRMNTS         8180                         national instruments corp       159390         national instruments (korea) corporation\n",
+       "176      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         6034       107265       afternext healthtech acquisition corporation       afternext healthtech acquisition corporation                   1     2.492261e+06                     e9           cayman islands                           0                   0.001069                   0.015387                 0.551065                             1.0     AFTRNKST HL0TX AKKSXN     AFTRNKST HL0TX AKKSXN         6034            afternext healthtech acquisition corp.       107265            afternext healthtech acquisition corp\n",
+       "178      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         6799       117610                                   gap incorporated                                   gap incorporated                   1     2.492261e+06               delaware              puerto rico                           0                   0.372842                   0.001548                 0.551065                             1.0                        KP                        KP         6799                                           gap inc       117610                           gap (puerto rico), inc\n",
+       "183      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         5811       170135                 rockley photonics holdings limited                 rockley photonics holdings limited                   1     2.492261e+06                     e9           cayman islands                           0                   0.001069                   0.015387                 0.551065                             1.0          RKL FTNKS HLTNKS          RKL FTNKS HLTNKS         5811                    rockley photonics holdings ltd       170135               rockley photonics holdings limited\n",
+       "184      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         6799       117608                                   gap incorporated                                   gap incorporated                   1     2.492261e+06               delaware               california                           0                   0.372842                   0.015978                 0.551065                             1.0                        KP                        KP         6799                                           gap inc       117608                                    gap (itm) inc\n",
+       "186      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         6799       117605                                   gap incorporated                                   gap incorporated                   1     2.492261e+06               delaware                   canada                           0                   0.372842                   0.012191                 0.551065                             1.0                        KP                        KP         6799                                           gap inc       117605                                 gap (canada) inc\n",
+       "412      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         1524       165843                                  aircastle limited                                  aircastle limited                   1     2.492261e+06                     d0                  ireland                           0                   0.000150                   0.008315                 0.551065                             1.0                    ARKSTL                    ARKSTL         1524                                     aircastle ltd       165843                      aircastle (ireland) limited\n",
+       "189      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         6753       115383                     arthur j gallagher and company                     arthur j gallagher and company                   1     2.492261e+06               illinois                 delaware                           0                   0.006115                   0.372842                 0.551065                             1.0          AR0R J KLKHR ANT          AR0R J KLKHR ANT         6753                         arthur j. gallagher & co.       115383                         arthur j. gallagher & co\n",
+       "193      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         6651       110797                              flowserve corporation                              flowserve corporation                   1     2.492261e+06               new york                mauritius                           0                   0.009913                   0.001075                 0.551065                             1.0                     FLSRF                     FLSRF         6651                                    flowserve corp       110797                flowserve (mauritius) corporation\n",
+       "406      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1          578        24844                 united parcel service incorporated                 united parcel service incorporated                   1     2.492261e+06               delaware                     ohio                           0                   0.372842                   0.008136                 0.551065                             1.0            UNTT PRSL SRFS            UNTT PRSL SRFS          578                         united parcel service inc        24844                       united parcel service, inc\n",
+       "198      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         5812       171905                            nextracker incorporated                            nextracker incorporated                   1     2.492261e+06               delaware   united states delaware                           0                   0.372842                   0.002278                 0.551065                             1.0                   NKSTRKR                   NKSTRKR         5812                                   nextracker inc.       171905                                   nextracker inc\n",
+       "199      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         5843        51850                        sculptor acquisition corp i                        sculptor acquisition corp i                   1     2.492261e+06                     e9           cayman islands                           0                   0.001069                   0.015387                 0.551065                             1.0           SKLPTR AKKSXN I           SKLPTR AKKSXN I         5843                       sculptor acquisition corp i        51850                      sculptor acquisition corp i\n",
+       "174      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         7095       179994                                 cintas corporation                                 cintas corporation                   1     2.492261e+06             washington                   nevada                           0                   0.002996                   0.014652                 0.551065                             1.0                      SNTS                      SNTS         7095                                       cintas corp       179994                               cintas corporation\n",
+       "405      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1          285        12641                               onespan incorporated                               onespan incorporated                   1     2.492261e+06               delaware   usa, state of delaware                           0                   0.372842                   0.000011                 0.551065                             1.0                     ONSPN                     ONSPN          285                                      onespan inc.        12641                                      onespan inc\n",
+       "207      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         6282        97173                       mars acquisition corporation                       mars acquisition corporation                   1     2.492261e+06                     e9                 delaware                           0                   0.001069                   0.372842                 0.551065                             1.0                MRS AKKSXN                MRS AKKSXN         6282                            mars acquisition corp.        97173                            mars acquisition corp\n",
+       "212      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         4834        97747                               viatris incorporated                               viatris incorporated                   1     2.492261e+06               delaware              philippines                           0                   0.372842                   0.001927                 0.551065                             1.0                      FTRS                      FTRS         4834                                       viatris inc        97747                                     viatris, inc\n",
+       "397      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         1205        35911                  turning point brands incorporated                  turning point brands incorporated                   1     2.492261e+06               delaware          ontario, canada                           0                   0.372842                   0.000852                 0.551065                             1.0           TRNNK PNT BRNTS           TRNNK PNT BRNTS         1205                        turning point brands, inc.        35911                turning point brands (canada) inc\n",
+       "396      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         1171        35941                      clearpoint neuro incorporated                      clearpoint neuro incorporated                   1     2.492261e+06               delaware     canada new brunswick                           0                   0.372842                   0.000006                 0.551065                             1.0                 KLRPNT NR                 KLRPNT NR         1171                            clearpoint neuro, inc.        35941                    clearpoint neuro (canada) inc\n",
+       "393      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         1765        51537                                    genpact limited                                    genpact limited                   1     2.492261e+06                     d0           united kingdom                           0                   0.000150                   0.031521                 0.551065                             1.0                     JNPKT                     JNPKT         1765                                       genpact ltd        51537                                 genpact (uk) ltd\n",
+       "223      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         6181       106386                             perimeter solutions sa                             perimeter solutions sa                   1     2.492261e+06                     n4      grand of luxembourg                           0                   0.000017                   0.000011                 0.551065                             1.0             PRMTR SLXNS S             PRMTR SLXNS S         6181                           perimeter solutions, sa       106386                           perimeter solutions sa\n",
+       "390      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1          949        34324                                  ceva incorporated                                  ceva incorporated                   1     2.492261e+06               delaware           cayman islands                           0                   0.372842                   0.015387                 0.551065                             1.0                        SF                        SF          949                                          ceva inc        34324                                         ceva inc\n",
+       "226      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         6825       123476                           harte hanks incorporated                           harte hanks incorporated                   1     2.492261e+06               delaware                     ohio                           0                   0.372842                   0.008136                 0.551065                             1.0                  HRT HNKS                  HRT HNKS         6825                                   harte hanks inc       123476                                 harte hanks, inc\n",
+       "228      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1          234         6600                    jones lang lasalle incorporated                    jones lang lasalle incorporated                   1     2.492261e+06               maryland              puerto rico                           0                   0.007786                   0.001548                 0.551065                             1.0               JNS LNK LSL               JNS LNK LSL          234                            jones lang lasalle inc         6600            jones lang lasalle (puerto rico), inc\n",
+       "229      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1          234         6596                    jones lang lasalle incorporated                    jones lang lasalle incorporated                   1     2.492261e+06               maryland              philippines                           0                   0.007786                   0.001927                 0.551065                             1.0               JNS LNK LSL               JNS LNK LSL          234                            jones lang lasalle inc         6596            jones lang lasalle (philippines), inc\n",
+       "231      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         2097        54939                             optimizerx corporation                             optimizerx corporation                   1     2.492261e+06                 nevada                 michigan                           0                   0.014652                   0.007151                 0.551065                             1.0                  OPTMSRKS                  OPTMSRKS         2097                                   optimizerx corp        54939                           optimizerx corporation\n",
+       "201      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         6176       166072                         phoenix motor incorporated                         phoenix motor incorporated                   1     2.492261e+06               delaware                       us                           0                   0.372842                   0.000908                 0.551065                             1.0                  FNKS MTR                  FNKS MTR         6176                                phoenix motor inc.       166072                                phoenix motor inc\n",
+       "232      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         2117        57288                                 transocean limited                                 transocean limited                   1     2.492261e+06                     v8              switzerland                           0                   0.000033                   0.006421                 0.551065                             1.0                    TRNSSN                    TRNSSN         2117                                   transocean ltd.        57288                                   transocean ltd\n",
+       "421      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         1348        40725             lazard group limited liability company             lazard group limited liability company                   1     2.492261e+06               delaware                       us                           0                   0.372842                   0.000908                 0.551065                             1.0                  LSRT KRP                  LSRT KRP         1348                                  lazard group llc        40725                                 lazard group llc\n",
+       "169      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         6922       189462                        analog devices incorporated                        analog devices incorporated                   1     2.492261e+06          massachusetts            united states                           0                   0.004466                   0.012146                 0.551065                             1.0                 ANLK TFSS                 ANLK TFSS         6922                                analog devices inc       189462                              analog devices, inc\n",
+       "115      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         2485       167379          ameriguard security services incorporated          ameriguard security services incorporated                   1     2.492261e+06                 nevada               california                           0                   0.014652                   0.015978                 0.551065                             1.0         AMRKRT SKRT SRFSS         AMRKRT SKRT SRFSS         2485                ameriguard security services, inc.       167379                ameriguard security services, inc\n",
+       "116      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         2486       167379          ameriguard security services incorporated          ameriguard security services incorporated                   1     2.492261e+06                 nevada               california                           0                   0.014652                   0.015978                 0.551065                             1.0         AMRKRT SKRT SRFSS         AMRKRT SKRT SRFSS         2486                ameriguard security services, inc.       167379                ameriguard security services, inc\n",
+       "120      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         4683        95837                   advantage solutions incorporated                   advantage solutions incorporated                   1     2.492261e+06               delaware                   canada                           0                   0.372842                   0.012191                 0.551065                             1.0              ATFNTJ SLXNS              ATFNTJ SLXNS         4683                          advantage solutions inc.        95837                          advantage solutions inc\n",
+       "445      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1          926       165871                     commvault systems incorporated                     commvault systems incorporated                   1     2.492261e+06               delaware          ontario, canada                           0                   0.372842                   0.000852                 0.551065                             1.0               KMFLT SSTMS               KMFLT SSTMS          926                             commvault systems inc       165871                   commvault systems (canada) inc\n",
+       "124      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         4148        90738                           firstsun capital bancorp                           firstsun capital bancorp                   1     2.492261e+06               delaware               new mexico                           0                   0.372842                   0.000652                 0.551065                             1.0         FRSTSN KPTL BNKRP         FRSTSN KPTL BNKRP         4148                          firstsun capital bancorp        90738                         firstsun capital bancorp\n",
+       "126      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         5544        26048                                taboola com limited                                taboola com limited                   1     2.492261e+06                     l3                   israel                           0                   0.000061                   0.003057                 0.551065                             1.0                    TBL KM                    TBL KM         5544                                  taboola.com ltd.        26048                                  taboola.com ltd\n",
+       "443      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1            2           96                          henry schein incorporated                          henry schein incorporated                   1     2.492261e+06               delaware             pennsylvania                           0                   0.372842                   0.007919                 0.551065                             1.0                   HNR SXN                   HNR SXN            2                                  henry schein inc           96                 henry schein (lancaster, pa) inc\n",
+       "132      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         6668       117995          tomi environmental solutions incorporated          tomi environmental solutions incorporated                   1     2.492261e+06                florida                   nevada                           0                   0.014691                   0.014652                 0.551065                             1.0        TM ENFRNMNTL SLXNS        TM ENFRNMNTL SLXNS         6668                tomi environmental solutions, inc.       117995                tomi environmental solutions, inc\n",
+       "136      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         6148       107455                                   esab corporation                                   esab corporation                   1     2.492261e+06               delaware            united states                           0                   0.372842                   0.012146                 0.551065                             1.0                       ESB                       ESB         6148                                         esab corp       107455                                 esab corporation\n",
+       "137      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         6958       104521                                 apache corporation                                 apache corporation                   1     2.492261e+06               delaware               new jersey                           0                   0.372842                   0.006143                 0.551065                             1.0                       APX                       APX         6958                                       apache corp       104521                               apache corporation\n",
+       "138      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         7011       121758                                    ncr corporation                                    ncr corporation                   1     2.492261e+06               maryland              new zealand                           0                   0.007786                   0.002590                 0.551065                             1.0                       NKR                       NKR         7011                                          ncr corp       121758                             ncr (nz) corporation\n",
+       "423      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1           77       165059                         jakks pacific incorporated                         jakks pacific incorporated                   1     2.492261e+06               delaware                   canada                           0                   0.372842                   0.012191                 0.551065                             1.0                  JKS PSFK                  JKS PSFK           77                                 jakks pacific inc       165059                      jakks pacific (canada), inc\n",
+       "139      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         4902       170051                                        gan limited                                        gan limited                   1     2.492261e+06                     d0        england and wales                           0                   0.000150                   0.003536                 0.551065                             1.0                        KN                        KN         4902                                           gan ltd       170051                                 gan (uk) limited\n",
+       "141      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         6613       108716                                    cts corporation                                    cts corporation                   1     2.492261e+06                indiana                 delaware                           0                   0.004060                   0.372842                 0.551065                             1.0                       KTS                       KTS         6613                                          cts corp       108716                                  cts corporation\n",
+       "437      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1          738        29776                                     garmin limited                                     garmin limited                   1     2.492261e+06                     v8                 thailand                           0                   0.000033                   0.002378                 0.551065                             1.0                      KRMN                      KRMN          738                                        garmin ltd        29776                            garmin (thailand) ltd\n",
+       "435      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1          277         9849                c h robinson worldwide incorporated                c h robinson worldwide incorporated                   1     2.492261e+06               delaware            united states                           0                   0.372842                   0.012146                 0.551065                             1.0          K H RBNSN WRLTWT          K H RBNSN WRLTWT          277                    c. h. robinson worldwide, inc.         9849                     c.h. robinson worldwide, inc\n",
+       "146      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         6763       176423                     richardson electronics limited                     richardson electronics limited                   1     2.492261e+06               delaware                 thailand                           0                   0.372842                   0.002378                 0.551065                             1.0           RXRTSN ELKTRNKS           RXRTSN ELKTRNKS         6763                      richardson electronics, ltd.       176423        richardson electronics (thailand) limited\n",
+       "149      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         4875        98755                              api group corporation                              api group corporation                   1     2.492261e+06                     d8                 delaware                           0                   0.000078                   0.372842                 0.551065                             1.0                    AP KRP                    AP KRP         4875                                    api group corp        98755                            api group corporation\n",
+       "432      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         2310       167475                thermon group holdings incorporated                thermon group holdings incorporated                   1     2.492261e+06               delaware  delaware, united states                           0                   0.372842                   0.002139                 0.551065                             1.0           0RMN KRP HLTNKS           0RMN KRP HLTNKS         2310                      thermon group holdings, inc.       167475                      thermon group holdings, inc\n",
+       "156      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         6677       118432                         aon public limited company                         aon public limited company                   1     2.492261e+06                     l2                  ireland                           0                   0.000111                   0.008315                 0.551065                             1.0                        AN                        AN         6677                                           aon plc       118432                                          aon plc\n",
+       "158      6.816691           0.991207  __splink__input_table_0  __splink__input_table_1         5955        80272  minority equality opportunities acquisition in...  minority equality opportunities acquisition in...                   1     2.492261e+06               delaware  delaware, united states                           0                   0.372842                   0.002139                 0.551065                             1.0  MNRT EKLT OPRTNTS AKKSXN  MNRT EKLT OPRTNTS AKKSXN         5955  minority equality opportunities acquisition inc.        80272  minority equality opportunities acquisition inc"
+      ]
+     },
+     "execution_count": 133,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "preds_df.sort_values(by=\"match_probability\").iloc[0:50]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 109,
+   "id": "59cc74aa-674b-4c89-95d6-181d0f7c162a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>match_weight</th>\n",
+       "      <th>match_probability</th>\n",
+       "      <th>source_dataset_l</th>\n",
+       "      <th>source_dataset_r</th>\n",
+       "      <th>record_id_l</th>\n",
+       "      <th>record_id_r</th>\n",
+       "      <th>company_name_l</th>\n",
+       "      <th>company_name_r</th>\n",
+       "      <th>gamma_company_name</th>\n",
+       "      <th>bf_company_name</th>\n",
+       "      <th>loc_of_incorporation_l</th>\n",
+       "      <th>loc_of_incorporation_r</th>\n",
+       "      <th>gamma_loc_of_incorporation</th>\n",
+       "      <th>tf_loc_of_incorporation_l</th>\n",
+       "      <th>tf_loc_of_incorporation_r</th>\n",
+       "      <th>bf_loc_of_incorporation</th>\n",
+       "      <th>bf_tf_adj_loc_of_incorporation</th>\n",
+       "      <th>company_name_mphone_l</th>\n",
+       "      <th>company_name_mphone_r</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>6.339909</td>\n",
+       "      <td>0.987805</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>8180</td>\n",
+       "      <td>159390</td>\n",
+       "      <td>national instruments corporation</td>\n",
+       "      <td>national instruments corporation</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>republic of korea</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.000234</td>\n",
+       "      <td>0.556230</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>NXNL INSTRMNTS</td>\n",
+       "      <td>NXNL INSTRMNTS</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>6.339909</td>\n",
+       "      <td>0.987805</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>7912</td>\n",
+       "      <td>154757</td>\n",
+       "      <td>enbridge incorporated</td>\n",
+       "      <td>enbridge incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>a0</td>\n",
+       "      <td>alberta</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000033</td>\n",
+       "      <td>0.000880</td>\n",
+       "      <td>0.556230</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>ENBRJ</td>\n",
+       "      <td>ENBRJ</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>6.339909</td>\n",
+       "      <td>0.987805</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>7557</td>\n",
+       "      <td>140921</td>\n",
+       "      <td>spectrum pharmaceuticals incorporated</td>\n",
+       "      <td>spectrum pharmaceuticals incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>cayman islands</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.015387</td>\n",
+       "      <td>0.556230</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>SPKTRM FRMSTKLS</td>\n",
+       "      <td>SPKTRM FRMSTKLS</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>7.717639</td>\n",
+       "      <td>0.995272</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>8057</td>\n",
+       "      <td>152329</td>\n",
+       "      <td>american eagle outfitters incorporated</td>\n",
+       "      <td>american eagle outfitters incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>2.487467</td>\n",
+       "      <td>0.581079</td>\n",
+       "      <td>AMRKN EKL OTFTRS</td>\n",
+       "      <td>AMRKN EKL OTFTRS</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>14.126362</td>\n",
+       "      <td>0.999944</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>7315</td>\n",
+       "      <td>28974</td>\n",
+       "      <td>pruco life insurance company</td>\n",
+       "      <td>pruco life insurance company</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>arizona</td>\n",
+       "      <td>arizona</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.004388</td>\n",
+       "      <td>0.004388</td>\n",
+       "      <td>2.487467</td>\n",
+       "      <td>49.368830</td>\n",
+       "      <td>PRK LF INSRNS</td>\n",
+       "      <td>PRK LF INSRNS</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>7.186156</td>\n",
+       "      <td>0.993180</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>7419</td>\n",
+       "      <td>142779</td>\n",
+       "      <td>national presto industries incorporated</td>\n",
+       "      <td>national presto industries incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>wisconsin</td>\n",
+       "      <td>None</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>0.004110</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>NXNL PRST INTSTRS</td>\n",
+       "      <td>NXNL PRST INTSTRS</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>6.339909</td>\n",
+       "      <td>0.987805</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>7387</td>\n",
+       "      <td>142016</td>\n",
+       "      <td>national bankshares incorporated</td>\n",
+       "      <td>national bankshares incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>virginia</td>\n",
+       "      <td>commonwealth virginia</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.006276</td>\n",
+       "      <td>0.000022</td>\n",
+       "      <td>0.556230</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>NXNL BNKXRS</td>\n",
+       "      <td>NXNL BNKXRS</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>13.610142</td>\n",
+       "      <td>0.999920</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>7387</td>\n",
+       "      <td>127697</td>\n",
+       "      <td>national bankshares incorporated</td>\n",
+       "      <td>national bankshares incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>virginia</td>\n",
+       "      <td>virginia</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.006276</td>\n",
+       "      <td>0.006276</td>\n",
+       "      <td>2.487467</td>\n",
+       "      <td>34.518756</td>\n",
+       "      <td>NXNL BNKXRS</td>\n",
+       "      <td>NXNL BNKXRS</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>7.717639</td>\n",
+       "      <td>0.995272</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>8258</td>\n",
+       "      <td>162906</td>\n",
+       "      <td>thermo fisher scientific incorporated</td>\n",
+       "      <td>thermo fisher scientific incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>2.487467</td>\n",
+       "      <td>0.581079</td>\n",
+       "      <td>0RM FXR SSNTFK</td>\n",
+       "      <td>0RM FXR SSNTFK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>12.101855</td>\n",
+       "      <td>0.999773</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>7428</td>\n",
+       "      <td>60197</td>\n",
+       "      <td>general motors financial company incorporated</td>\n",
+       "      <td>general motors financial company incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>texas</td>\n",
+       "      <td>texas</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.017854</td>\n",
+       "      <td>0.017854</td>\n",
+       "      <td>2.487467</td>\n",
+       "      <td>12.134323</td>\n",
+       "      <td>JNRL MTRS FNNXL</td>\n",
+       "      <td>JNRL MTRS FNNXL</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>6.339909</td>\n",
+       "      <td>0.987805</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>8258</td>\n",
+       "      <td>163501</td>\n",
+       "      <td>thermo fisher scientific incorporated</td>\n",
+       "      <td>thermo fisher scientific incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>mexico</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.011205</td>\n",
+       "      <td>0.556230</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>0RM FXR SSNTFK</td>\n",
+       "      <td>0RM FXR SSNTFK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>6.339909</td>\n",
+       "      <td>0.987805</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>5498</td>\n",
+       "      <td>52885</td>\n",
+       "      <td>apollo strategic growth capital ii</td>\n",
+       "      <td>apollo strategic growth capital ii</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>e9</td>\n",
+       "      <td>cayman islands</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.001069</td>\n",
+       "      <td>0.015387</td>\n",
+       "      <td>0.556230</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>APL STRTJK KR0 KPTL</td>\n",
+       "      <td>APL STRTJK KR0 KPTL</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12</th>\n",
+       "      <td>6.339909</td>\n",
+       "      <td>0.987805</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>8258</td>\n",
+       "      <td>162892</td>\n",
+       "      <td>thermo fisher scientific incorporated</td>\n",
+       "      <td>thermo fisher scientific incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>canada</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.012191</td>\n",
+       "      <td>0.556230</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>0RM FXR SSNTFK</td>\n",
+       "      <td>0RM FXR SSNTFK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>6.339909</td>\n",
+       "      <td>0.987805</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>8258</td>\n",
+       "      <td>162847</td>\n",
+       "      <td>thermo fisher scientific incorporated</td>\n",
+       "      <td>thermo fisher scientific incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>russia</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.001108</td>\n",
+       "      <td>0.556230</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>0RM FXR SSNTFK</td>\n",
+       "      <td>0RM FXR SSNTFK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14</th>\n",
+       "      <td>6.339909</td>\n",
+       "      <td>0.987805</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>498</td>\n",
+       "      <td>18301</td>\n",
+       "      <td>intellinetics incorporated</td>\n",
+       "      <td>intellinetics incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>nevada</td>\n",
+       "      <td>ohio</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.014652</td>\n",
+       "      <td>0.008136</td>\n",
+       "      <td>0.556230</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>INTLNTKS</td>\n",
+       "      <td>INTLNTKS</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15</th>\n",
+       "      <td>6.339909</td>\n",
+       "      <td>0.987805</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>1533</td>\n",
+       "      <td>165897</td>\n",
+       "      <td>high sierra technologies incorporated</td>\n",
+       "      <td>high sierra technologies incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>colorado</td>\n",
+       "      <td>nevada</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.004817</td>\n",
+       "      <td>0.014652</td>\n",
+       "      <td>0.556230</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>H SR TXNLJS</td>\n",
+       "      <td>H SR TXNLJS</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>16</th>\n",
+       "      <td>13.991858</td>\n",
+       "      <td>0.999939</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>2127</td>\n",
+       "      <td>61213</td>\n",
+       "      <td>lnpr group incorporated</td>\n",
+       "      <td>lnpr group incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>colorado</td>\n",
+       "      <td>colorado</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.004817</td>\n",
+       "      <td>0.004817</td>\n",
+       "      <td>2.487467</td>\n",
+       "      <td>44.974148</td>\n",
+       "      <td>LNPR KRP</td>\n",
+       "      <td>LNPR KRP</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17</th>\n",
+       "      <td>7.186156</td>\n",
+       "      <td>0.993180</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>93</td>\n",
+       "      <td>1969</td>\n",
+       "      <td>norwood financial corporation</td>\n",
+       "      <td>norwood financial corporation</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>pennsylvania</td>\n",
+       "      <td>None</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>0.007919</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>NRWT FNNXL</td>\n",
+       "      <td>NRWT FNNXL</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>18</th>\n",
+       "      <td>6.339909</td>\n",
+       "      <td>0.987805</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>151</td>\n",
+       "      <td>2257</td>\n",
+       "      <td>nov incorporated</td>\n",
+       "      <td>nov incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>mauritius</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.001075</td>\n",
+       "      <td>0.556230</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>NF</td>\n",
+       "      <td>NF</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>19</th>\n",
+       "      <td>6.339909</td>\n",
+       "      <td>0.987805</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>280</td>\n",
+       "      <td>10975</td>\n",
+       "      <td>juniper networks incorporated</td>\n",
+       "      <td>juniper networks incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>california, usa</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.000234</td>\n",
+       "      <td>0.556230</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>JNPR NTWRKS</td>\n",
+       "      <td>JNPR NTWRKS</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>20</th>\n",
+       "      <td>3.252392</td>\n",
+       "      <td>0.905028</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>1399</td>\n",
+       "      <td>157790</td>\n",
+       "      <td>logiq incorporated</td>\n",
+       "      <td>logiq3 incorporated</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2.087284e+05</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>canada</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.012191</td>\n",
+       "      <td>0.556230</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>LJK</td>\n",
+       "      <td>LJK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>21</th>\n",
+       "      <td>7.717639</td>\n",
+       "      <td>0.995272</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>1720</td>\n",
+       "      <td>166283</td>\n",
+       "      <td>edgio incorporated</td>\n",
+       "      <td>edgio incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>2.487467</td>\n",
+       "      <td>0.581079</td>\n",
+       "      <td>EJ</td>\n",
+       "      <td>EJ</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>22</th>\n",
+       "      <td>6.339909</td>\n",
+       "      <td>0.987805</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>2020</td>\n",
+       "      <td>184709</td>\n",
+       "      <td>arem pacific corporation</td>\n",
+       "      <td>arem pacific corporation</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>arizona</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.004388</td>\n",
+       "      <td>0.556230</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>ARM PSFK</td>\n",
+       "      <td>ARM PSFK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>23</th>\n",
+       "      <td>7.186156</td>\n",
+       "      <td>0.993180</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>756</td>\n",
+       "      <td>26596</td>\n",
+       "      <td>ensign group incorporated</td>\n",
+       "      <td>ensign group incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>None</td>\n",
+       "      <td>nevada</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0.014652</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>ENSKN KRP</td>\n",
+       "      <td>ENSKN KRP</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>24</th>\n",
+       "      <td>7.186156</td>\n",
+       "      <td>0.993180</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>1104</td>\n",
+       "      <td>24668</td>\n",
+       "      <td>cco holdings limited liability company</td>\n",
+       "      <td>cco holdings limited liability company</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>None</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>KK HLTNKS</td>\n",
+       "      <td>KK HLTNKS</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>25</th>\n",
+       "      <td>7.717639</td>\n",
+       "      <td>0.995272</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>321</td>\n",
+       "      <td>11011</td>\n",
+       "      <td>pc connection incorporated</td>\n",
+       "      <td>pc connection incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>2.487467</td>\n",
+       "      <td>0.581079</td>\n",
+       "      <td>KNKXN</td>\n",
+       "      <td>KNKXN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>26</th>\n",
+       "      <td>6.339909</td>\n",
+       "      <td>0.987805</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>477</td>\n",
+       "      <td>14483</td>\n",
+       "      <td>polarityte incorporated</td>\n",
+       "      <td>polarityte incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>nevada</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.014652</td>\n",
+       "      <td>0.556230</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>PLRTT</td>\n",
+       "      <td>PLRTT</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>27</th>\n",
+       "      <td>7.717639</td>\n",
+       "      <td>0.995272</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>810</td>\n",
+       "      <td>25991</td>\n",
+       "      <td>atlas air worldwide holdings incorporated</td>\n",
+       "      <td>atlas air worldwide holdings incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>2.487467</td>\n",
+       "      <td>0.581079</td>\n",
+       "      <td>ATLS AR WRLTWT HLTNKS</td>\n",
+       "      <td>ATLS AR WRLTWT HLTNKS</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>28</th>\n",
+       "      <td>6.339909</td>\n",
+       "      <td>0.987805</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>1003</td>\n",
+       "      <td>166010</td>\n",
+       "      <td>spi energy co limited</td>\n",
+       "      <td>spi energy co limited</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>e9</td>\n",
+       "      <td>cayman</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.001069</td>\n",
+       "      <td>0.000345</td>\n",
+       "      <td>0.556230</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>SP ENRJ</td>\n",
+       "      <td>SP ENRJ</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>29</th>\n",
+       "      <td>7.717639</td>\n",
+       "      <td>0.995272</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>1012</td>\n",
+       "      <td>165926</td>\n",
+       "      <td>bimi international medical incorporated</td>\n",
+       "      <td>bimi international medical incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>2.487467</td>\n",
+       "      <td>0.581079</td>\n",
+       "      <td>BM INTRNXNL MTKL</td>\n",
+       "      <td>BM INTRNXNL MTKL</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>30</th>\n",
+       "      <td>7.186156</td>\n",
+       "      <td>0.993180</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>1868</td>\n",
+       "      <td>51876</td>\n",
+       "      <td>phreesia incorporated</td>\n",
+       "      <td>phreesia incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>None</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>FRX</td>\n",
+       "      <td>FRX</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>31</th>\n",
+       "      <td>6.339909</td>\n",
+       "      <td>0.987805</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>2198</td>\n",
+       "      <td>78290</td>\n",
+       "      <td>secureworks corporation</td>\n",
+       "      <td>secureworks corporation</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>united states</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.012146</td>\n",
+       "      <td>0.556230</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>SKRWRKS</td>\n",
+       "      <td>SKRWRKS</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>32</th>\n",
+       "      <td>7.717639</td>\n",
+       "      <td>0.995272</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>2273</td>\n",
+       "      <td>58771</td>\n",
+       "      <td>ryerson holding corporation</td>\n",
+       "      <td>ryerson holding corporation</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>2.487467</td>\n",
+       "      <td>0.581079</td>\n",
+       "      <td>RYRSN HLTNK</td>\n",
+       "      <td>RYRSN HLTNK</td>\n",
        "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
        "    <tr>\n",
-       "      <th>5374</th>\n",
-       "      <td>0.008914</td>\n",
-       "      <td>0.501545</td>\n",
+       "      <th>33</th>\n",
+       "      <td>7.186156</td>\n",
+       "      <td>0.993180</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>6916</td>\n",
-       "      <td>7681</td>\n",
-       "      <td>manitowoc co incorporated</td>\n",
-       "      <td>manitowoc crane companies, llc mcg</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.000005</td>\n",
-       "      <td>0.000005</td>\n",
-       "      <td>12.534319</td>\n",
+       "      <td>221</td>\n",
+       "      <td>9106</td>\n",
+       "      <td>comfort systems usa incorporated</td>\n",
+       "      <td>comfort systems usa incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>None</td>\n",
+       "      <td>arkansas</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0.001253</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>wisconsin</td>\n",
-       "      <td>wisconsin</td>\n",
-       "      <td>3</td>\n",
-       "      <td>0.004100</td>\n",
-       "      <td>0.004100</td>\n",
-       "      <td>2.32178</td>\n",
-       "      <td>50.180785</td>\n",
-       "      <td>2023</td>\n",
-       "      <td>2023</td>\n",
-       "      <td>MNTWK K INKRPRTT</td>\n",
-       "      <td>MNTWK KRN KMPNS LK MKK</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>KMFRT SSTMS US</td>\n",
+       "      <td>KMFRT SSTMS US</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1452</th>\n",
-       "      <td>0.008914</td>\n",
-       "      <td>0.501545</td>\n",
+       "      <th>34</th>\n",
+       "      <td>14.351809</td>\n",
+       "      <td>0.999952</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>3995</td>\n",
-       "      <td>1003</td>\n",
-       "      <td>schneider national, incorporated</td>\n",
-       "      <td>33.schneider logistics, incorporated</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.000005</td>\n",
-       "      <td>0.000005</td>\n",
-       "      <td>12.534319</td>\n",
+       "      <td>478</td>\n",
+       "      <td>180383</td>\n",
+       "      <td>winnebago industries incorporated</td>\n",
+       "      <td>winnebago industries incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>minnesota</td>\n",
+       "      <td>minnesota</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.003754</td>\n",
+       "      <td>0.003754</td>\n",
+       "      <td>2.487467</td>\n",
+       "      <td>57.719048</td>\n",
+       "      <td>WNBK INTSTRS</td>\n",
+       "      <td>WNBK INTSTRS</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>35</th>\n",
+       "      <td>6.339909</td>\n",
+       "      <td>0.987805</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>1913</td>\n",
+       "      <td>166068</td>\n",
+       "      <td>renewable energy acquisition corporation</td>\n",
+       "      <td>renewable energy acquisition corporation</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>nevada</td>\n",
+       "      <td>us</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.014652</td>\n",
+       "      <td>0.000908</td>\n",
+       "      <td>0.556230</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>wisconsin</td>\n",
-       "      <td>wisconsin</td>\n",
-       "      <td>3</td>\n",
-       "      <td>0.004100</td>\n",
-       "      <td>0.004100</td>\n",
-       "      <td>2.32178</td>\n",
-       "      <td>50.180785</td>\n",
-       "      <td>2023</td>\n",
-       "      <td>2023</td>\n",
-       "      <td>SXNTR NXNL INKRPRTT</td>\n",
-       "      <td>SXNTR LJSTKS INKRPRTT</td>\n",
+       "      <td>RNWBL ENRJ AKKSXN</td>\n",
+       "      <td>RNWBL ENRJ AKKSXN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>4185</th>\n",
-       "      <td>0.008914</td>\n",
-       "      <td>0.501545</td>\n",
+       "      <th>36</th>\n",
+       "      <td>7.186156</td>\n",
+       "      <td>0.993180</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>485</td>\n",
-       "      <td>6819</td>\n",
-       "      <td>wisconsin electric power company</td>\n",
-       "      <td>wisconsin energy capital corporation</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.000010</td>\n",
-       "      <td>0.000005</td>\n",
-       "      <td>12.534319</td>\n",
+       "      <td>257</td>\n",
+       "      <td>164606</td>\n",
+       "      <td>riverview bancorp incorporated</td>\n",
+       "      <td>riverview bancorp incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>washington</td>\n",
+       "      <td>None</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>0.002996</td>\n",
+       "      <td>NaN</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>wisconsin</td>\n",
-       "      <td>wisconsin</td>\n",
-       "      <td>3</td>\n",
-       "      <td>0.004100</td>\n",
-       "      <td>0.004100</td>\n",
-       "      <td>2.32178</td>\n",
-       "      <td>50.180785</td>\n",
-       "      <td>2023</td>\n",
-       "      <td>2023</td>\n",
-       "      <td>WSKNSN ELKTRK PWR KMPN</td>\n",
-       "      <td>WSKNSN ENRJ KPTL KRPRXN</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>RFRF BNKRP</td>\n",
+       "      <td>RFRF BNKRP</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>3907</th>\n",
-       "      <td>0.008914</td>\n",
-       "      <td>0.501545</td>\n",
+       "      <th>37</th>\n",
+       "      <td>7.186156</td>\n",
+       "      <td>0.993180</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>1836</td>\n",
-       "      <td>1390</td>\n",
-       "      <td>orion energy systems, incorporated</td>\n",
-       "      <td>wilson funeral home, incorporated</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.000005</td>\n",
-       "      <td>0.000005</td>\n",
-       "      <td>12.534319</td>\n",
+       "      <td>294</td>\n",
+       "      <td>182945</td>\n",
+       "      <td>timberland bancorp incorporated</td>\n",
+       "      <td>timberland bancorp incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>washington</td>\n",
+       "      <td>None</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>0.002996</td>\n",
+       "      <td>NaN</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>wisconsin</td>\n",
-       "      <td>wisconsin</td>\n",
-       "      <td>3</td>\n",
-       "      <td>0.004100</td>\n",
-       "      <td>0.004100</td>\n",
-       "      <td>2.32178</td>\n",
-       "      <td>50.180785</td>\n",
-       "      <td>2023</td>\n",
-       "      <td>2023</td>\n",
-       "      <td>ORN ENRJ SSTMS INKRPRTT</td>\n",
-       "      <td>WLSN FNRL HM INKRPRTT</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>TMBRLNT BNKRP</td>\n",
+       "      <td>TMBRLNT BNKRP</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>38</th>\n",
+       "      <td>7.717639</td>\n",
+       "      <td>0.995272</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>415</td>\n",
+       "      <td>18543</td>\n",
+       "      <td>lkq corporation</td>\n",
+       "      <td>lkq corporation</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>2.487467</td>\n",
+       "      <td>0.581079</td>\n",
+       "      <td>LKK</td>\n",
+       "      <td>LKK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>39</th>\n",
+       "      <td>7.717639</td>\n",
+       "      <td>0.995272</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>674</td>\n",
+       "      <td>23252</td>\n",
+       "      <td>berkshire hills bancorp incorporated</td>\n",
+       "      <td>berkshire hills bancorp incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>2.487467</td>\n",
+       "      <td>0.581079</td>\n",
+       "      <td>BRKXR HLS BNKRP</td>\n",
+       "      <td>BRKXR HLS BNKRP</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>40</th>\n",
+       "      <td>6.339909</td>\n",
+       "      <td>0.987805</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>1270</td>\n",
+       "      <td>181001</td>\n",
+       "      <td>dolby laboratories incorporated</td>\n",
+       "      <td>dolby laboratories incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>california</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.015978</td>\n",
+       "      <td>0.556230</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>TLB LBRTRS</td>\n",
+       "      <td>TLB LBRTRS</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1426</th>\n",
-       "      <td>0.008914</td>\n",
-       "      <td>0.501545</td>\n",
+       "      <th>41</th>\n",
+       "      <td>3.252392</td>\n",
+       "      <td>0.905028</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>3995</td>\n",
-       "      <td>1010</td>\n",
-       "      <td>schneider national, incorporated</td>\n",
-       "      <td>40.schneider resources, incorporated</td>\n",
+       "      <td>1321</td>\n",
+       "      <td>132984</td>\n",
+       "      <td>tss incorporated</td>\n",
+       "      <td>dss incorporated</td>\n",
        "      <td>1</td>\n",
-       "      <td>0.000005</td>\n",
-       "      <td>0.000005</td>\n",
-       "      <td>12.534319</td>\n",
+       "      <td>2.087284e+05</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>new york</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.009913</td>\n",
+       "      <td>0.556230</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>wisconsin</td>\n",
-       "      <td>wisconsin</td>\n",
-       "      <td>3</td>\n",
-       "      <td>0.004100</td>\n",
-       "      <td>0.004100</td>\n",
-       "      <td>2.32178</td>\n",
-       "      <td>50.180785</td>\n",
-       "      <td>2023</td>\n",
-       "      <td>2023</td>\n",
-       "      <td>SXNTR NXNL INKRPRTT</td>\n",
-       "      <td>SXNTR RSRSS INKRPRTT</td>\n",
+       "      <td>TS</td>\n",
+       "      <td>TS</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
+       "      <th>42</th>\n",
+       "      <td>7.717639</td>\n",
+       "      <td>0.995272</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>1482</td>\n",
+       "      <td>46045</td>\n",
+       "      <td>anywhere real estate incorporated</td>\n",
+       "      <td>anywhere real estate incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>2.487467</td>\n",
+       "      <td>0.581079</td>\n",
+       "      <td>ANHR RL ESTT</td>\n",
+       "      <td>ANHR RL ESTT</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>4672</th>\n",
-       "      <td>13.232266</td>\n",
-       "      <td>0.999896</td>\n",
+       "      <th>43</th>\n",
+       "      <td>6.339909</td>\n",
+       "      <td>0.987805</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>6568</td>\n",
-       "      <td>4608</td>\n",
-       "      <td>wesbanco incorporated</td>\n",
-       "      <td>wesbanco, incorporated</td>\n",
-       "      <td>3</td>\n",
-       "      <td>0.000005</td>\n",
-       "      <td>0.000005</td>\n",
-       "      <td>35295.437753</td>\n",
+       "      <td>1494</td>\n",
+       "      <td>47625</td>\n",
+       "      <td>kbr incorporated</td>\n",
+       "      <td>kbr incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>united states</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.012146</td>\n",
+       "      <td>0.556230</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>west virginia</td>\n",
-       "      <td>west virginia</td>\n",
-       "      <td>3</td>\n",
-       "      <td>0.001207</td>\n",
-       "      <td>0.001207</td>\n",
-       "      <td>2.32178</td>\n",
-       "      <td>170.429672</td>\n",
-       "      <td>2023</td>\n",
-       "      <td>2023</td>\n",
-       "      <td>WSBNK INKRPRTT</td>\n",
-       "      <td>WSBNK INKRPRTT</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1829</th>\n",
-       "      <td>13.257062</td>\n",
-       "      <td>0.999898</td>\n",
-       "      <td>__splink__input_table_0</td>\n",
-       "      <td>__splink__input_table_1</td>\n",
-       "      <td>497</td>\n",
-       "      <td>4974</td>\n",
-       "      <td>berkshire hathaway energy company</td>\n",
-       "      <td>berkshire hathaway energy company</td>\n",
-       "      <td>4</td>\n",
-       "      <td>0.000010</td>\n",
-       "      <td>0.000010</td>\n",
-       "      <td>695779.273116</td>\n",
-       "      <td>0.053272</td>\n",
-       "      <td>iowa</td>\n",
-       "      <td>iowa</td>\n",
-       "      <td>3</td>\n",
-       "      <td>0.001246</td>\n",
-       "      <td>0.001246</td>\n",
-       "      <td>2.32178</td>\n",
-       "      <td>165.103745</td>\n",
-       "      <td>2023</td>\n",
-       "      <td>2023</td>\n",
-       "      <td>BRKXR H0W ENRJ KMPN</td>\n",
-       "      <td>BRKXR H0W ENRJ KMPN</td>\n",
+       "      <td>KBR</td>\n",
+       "      <td>KBR</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>44</th>\n",
+       "      <td>7.717639</td>\n",
+       "      <td>0.995272</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>1972</td>\n",
+       "      <td>166348</td>\n",
+       "      <td>reshape lifesciences incorporated</td>\n",
+       "      <td>reshape lifesciences incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>2.487467</td>\n",
+       "      <td>0.581079</td>\n",
+       "      <td>RXP LFSSNSS</td>\n",
+       "      <td>RXP LFSSNSS</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>45</th>\n",
+       "      <td>12.387018</td>\n",
+       "      <td>0.999813</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>1457</td>\n",
+       "      <td>172081</td>\n",
+       "      <td>imperalis holding corporation</td>\n",
+       "      <td>imperalis holding corporation</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>nevada</td>\n",
+       "      <td>nevada</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.014652</td>\n",
+       "      <td>0.014652</td>\n",
+       "      <td>2.487467</td>\n",
+       "      <td>14.786255</td>\n",
+       "      <td>IMPRLS HLTNK</td>\n",
+       "      <td>IMPRLS HLTNK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>46</th>\n",
+       "      <td>12.387018</td>\n",
+       "      <td>0.999813</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>2037</td>\n",
+       "      <td>172091</td>\n",
+       "      <td>bitnile metaverse incorporated</td>\n",
+       "      <td>bitnile metaverse incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>nevada</td>\n",
+       "      <td>nevada</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.014652</td>\n",
+       "      <td>0.014652</td>\n",
+       "      <td>2.487467</td>\n",
+       "      <td>14.786255</td>\n",
+       "      <td>BTNL MTFRS</td>\n",
+       "      <td>BTNL MTFRS</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>47</th>\n",
+       "      <td>7.717639</td>\n",
+       "      <td>0.995272</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>1058</td>\n",
+       "      <td>35808</td>\n",
+       "      <td>qvc incorporated</td>\n",
+       "      <td>qvc incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>2.487467</td>\n",
+       "      <td>0.581079</td>\n",
+       "      <td>KFK</td>\n",
+       "      <td>KFK</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>6458</th>\n",
-       "      <td>13.550873</td>\n",
-       "      <td>0.999917</td>\n",
+       "      <th>48</th>\n",
+       "      <td>9.692877</td>\n",
+       "      <td>0.998793</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>3842</td>\n",
-       "      <td>749</td>\n",
-       "      <td>shiftpixy, incorporated</td>\n",
-       "      <td>shiftpixy labs, incorporated</td>\n",
-       "      <td>3</td>\n",
-       "      <td>0.000005</td>\n",
-       "      <td>0.000005</td>\n",
-       "      <td>35295.437753</td>\n",
+       "      <td>1705</td>\n",
+       "      <td>47703</td>\n",
+       "      <td>irhythm technologies incorporated</td>\n",
+       "      <td>irhythm technologies incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>us delaware</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.372842</td>\n",
+       "      <td>0.000323</td>\n",
+       "      <td>5.683268</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>wyoming</td>\n",
-       "      <td>wyoming</td>\n",
-       "      <td>3</td>\n",
-       "      <td>0.000968</td>\n",
-       "      <td>0.000968</td>\n",
-       "      <td>2.32178</td>\n",
-       "      <td>212.547350</td>\n",
-       "      <td>2023</td>\n",
-       "      <td>2023</td>\n",
-       "      <td>XFTPKS INKRPRTT</td>\n",
-       "      <td>XFTPKS LBS INKRPRTT</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1330</th>\n",
-       "      <td>13.621474</td>\n",
-       "      <td>0.999921</td>\n",
-       "      <td>__splink__input_table_0</td>\n",
-       "      <td>__splink__input_table_1</td>\n",
-       "      <td>4088</td>\n",
-       "      <td>476</td>\n",
-       "      <td>securetech innovations, incorporated</td>\n",
-       "      <td>securetech innovations, incorporated</td>\n",
-       "      <td>4</td>\n",
-       "      <td>0.000010</td>\n",
-       "      <td>0.000010</td>\n",
-       "      <td>695779.273116</td>\n",
-       "      <td>0.053272</td>\n",
-       "      <td>wyoming</td>\n",
-       "      <td>wyoming</td>\n",
-       "      <td>3</td>\n",
-       "      <td>0.000968</td>\n",
-       "      <td>0.000968</td>\n",
-       "      <td>2.32178</td>\n",
-       "      <td>212.547350</td>\n",
-       "      <td>2023</td>\n",
-       "      <td>2023</td>\n",
-       "      <td>SKRTX INFXNS INKRPRTT</td>\n",
-       "      <td>SKRTX INFXNS INKRPRTT</td>\n",
+       "      <td>IRH0M TXNLJS</td>\n",
+       "      <td>IRH0M TXNLJS</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>6186</th>\n",
-       "      <td>14.206436</td>\n",
-       "      <td>0.999947</td>\n",
+       "      <th>49</th>\n",
+       "      <td>7.186156</td>\n",
+       "      <td>0.993180</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>8116</td>\n",
-       "      <td>2004</td>\n",
-       "      <td>southwestern public service company</td>\n",
-       "      <td>southwestern public service company</td>\n",
-       "      <td>4</td>\n",
-       "      <td>0.000010</td>\n",
-       "      <td>0.000010</td>\n",
-       "      <td>695779.273116</td>\n",
-       "      <td>0.053272</td>\n",
-       "      <td>new mexico</td>\n",
-       "      <td>new mexico</td>\n",
-       "      <td>3</td>\n",
-       "      <td>0.000645</td>\n",
-       "      <td>0.000645</td>\n",
-       "      <td>2.32178</td>\n",
-       "      <td>318.821024</td>\n",
-       "      <td>2023</td>\n",
-       "      <td>2023</td>\n",
-       "      <td>S0WSTRN PBLK SRFS KMPN</td>\n",
-       "      <td>S0WSTRN PBLK SRFS KMPN</td>\n",
+       "      <td>338</td>\n",
+       "      <td>13985</td>\n",
+       "      <td>essex property trust incorporated</td>\n",
+       "      <td>essex property trust incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.774257e+06</td>\n",
+       "      <td>maryland</td>\n",
+       "      <td>None</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>0.007786</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>ESKS PRPRT TRST</td>\n",
+       "      <td>ESKS PRPRT TRST</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
-       "<p>7540 rows × 24 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
-       "      match_weight  match_probability         source_dataset_l         source_dataset_r  record_id_l  record_id_r                        company_name_l                        company_name_r  gamma_company_name  tf_company_name_l  tf_company_name_r  bf_company_name  bf_tf_adj_company_name loc_of_incorporation_l loc_of_incorporation_r  gamma_loc_of_incorporation  tf_loc_of_incorporation_l  tf_loc_of_incorporation_r  bf_loc_of_incorporation  bf_tf_adj_loc_of_incorporation  report_year_l  report_year_r    company_name_mphone_l    company_name_mphone_r\n",
-       "5374      0.008914           0.501545  __splink__input_table_0  __splink__input_table_1         6916         7681             manitowoc co incorporated    manitowoc crane companies, llc mcg                   1           0.000005           0.000005        12.534319                1.000000              wisconsin              wisconsin                           3                   0.004100                   0.004100                  2.32178                       50.180785           2023           2023         MNTWK K INKRPRTT   MNTWK KRN KMPNS LK MKK\n",
-       "1452      0.008914           0.501545  __splink__input_table_0  __splink__input_table_1         3995         1003      schneider national, incorporated  33.schneider logistics, incorporated                   1           0.000005           0.000005        12.534319                1.000000              wisconsin              wisconsin                           3                   0.004100                   0.004100                  2.32178                       50.180785           2023           2023      SXNTR NXNL INKRPRTT    SXNTR LJSTKS INKRPRTT\n",
-       "4185      0.008914           0.501545  __splink__input_table_0  __splink__input_table_1          485         6819      wisconsin electric power company  wisconsin energy capital corporation                   1           0.000010           0.000005        12.534319                1.000000              wisconsin              wisconsin                           3                   0.004100                   0.004100                  2.32178                       50.180785           2023           2023   WSKNSN ELKTRK PWR KMPN  WSKNSN ENRJ KPTL KRPRXN\n",
-       "3907      0.008914           0.501545  __splink__input_table_0  __splink__input_table_1         1836         1390    orion energy systems, incorporated     wilson funeral home, incorporated                   1           0.000005           0.000005        12.534319                1.000000              wisconsin              wisconsin                           3                   0.004100                   0.004100                  2.32178                       50.180785           2023           2023  ORN ENRJ SSTMS INKRPRTT    WLSN FNRL HM INKRPRTT\n",
-       "1426      0.008914           0.501545  __splink__input_table_0  __splink__input_table_1         3995         1010      schneider national, incorporated  40.schneider resources, incorporated                   1           0.000005           0.000005        12.534319                1.000000              wisconsin              wisconsin                           3                   0.004100                   0.004100                  2.32178                       50.180785           2023           2023      SXNTR NXNL INKRPRTT     SXNTR RSRSS INKRPRTT\n",
-       "...            ...                ...                      ...                      ...          ...          ...                                   ...                                   ...                 ...                ...                ...              ...                     ...                    ...                    ...                         ...                        ...                        ...                      ...                             ...            ...            ...                      ...                      ...\n",
-       "4672     13.232266           0.999896  __splink__input_table_0  __splink__input_table_1         6568         4608                 wesbanco incorporated                wesbanco, incorporated                   3           0.000005           0.000005     35295.437753                1.000000          west virginia          west virginia                           3                   0.001207                   0.001207                  2.32178                      170.429672           2023           2023           WSBNK INKRPRTT           WSBNK INKRPRTT\n",
-       "1829     13.257062           0.999898  __splink__input_table_0  __splink__input_table_1          497         4974     berkshire hathaway energy company     berkshire hathaway energy company                   4           0.000010           0.000010    695779.273116                0.053272                   iowa                   iowa                           3                   0.001246                   0.001246                  2.32178                      165.103745           2023           2023      BRKXR H0W ENRJ KMPN      BRKXR H0W ENRJ KMPN\n",
-       "6458     13.550873           0.999917  __splink__input_table_0  __splink__input_table_1         3842          749               shiftpixy, incorporated          shiftpixy labs, incorporated                   3           0.000005           0.000005     35295.437753                1.000000                wyoming                wyoming                           3                   0.000968                   0.000968                  2.32178                      212.547350           2023           2023          XFTPKS INKRPRTT      XFTPKS LBS INKRPRTT\n",
-       "1330     13.621474           0.999921  __splink__input_table_0  __splink__input_table_1         4088          476  securetech innovations, incorporated  securetech innovations, incorporated                   4           0.000010           0.000010    695779.273116                0.053272                wyoming                wyoming                           3                   0.000968                   0.000968                  2.32178                      212.547350           2023           2023    SKRTX INFXNS INKRPRTT    SKRTX INFXNS INKRPRTT\n",
-       "6186     14.206436           0.999947  __splink__input_table_0  __splink__input_table_1         8116         2004   southwestern public service company   southwestern public service company                   4           0.000010           0.000010    695779.273116                0.053272             new mexico             new mexico                           3                   0.000645                   0.000645                  2.32178                      318.821024           2023           2023   S0WSTRN PBLK SRFS KMPN   S0WSTRN PBLK SRFS KMPN\n",
-       "\n",
-       "[7540 rows x 24 columns]"
-      ]
-     },
-     "execution_count": 123,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "preds_df.sort_values(by=\"match_probability\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 238,
-   "id": "255272b6-a5c4-4ab8-bebc-d13e77655938",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "Index(['match_weight', 'match_probability', 'source_dataset_l',\n",
-       "       'source_dataset_r', 'record_id_l', 'record_id_r', 'company_name_l',\n",
-       "       'company_name_r', 'gamma_company_name', 'tf_company_name_l',\n",
-       "       'tf_company_name_r', 'bf_company_name', 'bf_tf_adj_company_name',\n",
-       "       'loc_of_incorporation_l', 'loc_of_incorporation_r',\n",
-       "       'gamma_loc_of_incorporation', 'tf_loc_of_incorporation_l',\n",
-       "       'tf_loc_of_incorporation_r', 'bf_loc_of_incorporation',\n",
-       "       'bf_tf_adj_loc_of_incorporation', 'company_name_mphone_l',\n",
-       "       'company_name_mphone_r', 'report_year_l', 'report_year_r'],\n",
-       "      dtype='object')"
+       "    match_weight  match_probability         source_dataset_l         source_dataset_r  record_id_l  record_id_r                                 company_name_l                                 company_name_r  gamma_company_name  bf_company_name loc_of_incorporation_l loc_of_incorporation_r  gamma_loc_of_incorporation  tf_loc_of_incorporation_l  tf_loc_of_incorporation_r  bf_loc_of_incorporation  bf_tf_adj_loc_of_incorporation  company_name_mphone_l  company_name_mphone_r\n",
+       "0       6.339909           0.987805  __splink__input_table_0  __splink__input_table_1         8180       159390               national instruments corporation               national instruments corporation                   2     1.774257e+06               delaware      republic of korea                           0                   0.372842                   0.000234                 0.556230                        1.000000         NXNL INSTRMNTS         NXNL INSTRMNTS\n",
+       "1       6.339909           0.987805  __splink__input_table_0  __splink__input_table_1         7912       154757                          enbridge incorporated                          enbridge incorporated                   2     1.774257e+06                     a0                alberta                           0                   0.000033                   0.000880                 0.556230                        1.000000                  ENBRJ                  ENBRJ\n",
+       "2       6.339909           0.987805  __splink__input_table_0  __splink__input_table_1         7557       140921          spectrum pharmaceuticals incorporated          spectrum pharmaceuticals incorporated                   2     1.774257e+06               delaware         cayman islands                           0                   0.372842                   0.015387                 0.556230                        1.000000        SPKTRM FRMSTKLS        SPKTRM FRMSTKLS\n",
+       "3       7.717639           0.995272  __splink__input_table_0  __splink__input_table_1         8057       152329         american eagle outfitters incorporated         american eagle outfitters incorporated                   2     1.774257e+06               delaware               delaware                           2                   0.372842                   0.372842                 2.487467                        0.581079       AMRKN EKL OTFTRS       AMRKN EKL OTFTRS\n",
+       "4      14.126362           0.999944  __splink__input_table_0  __splink__input_table_1         7315        28974                   pruco life insurance company                   pruco life insurance company                   2     1.774257e+06                arizona                arizona                           2                   0.004388                   0.004388                 2.487467                       49.368830          PRK LF INSRNS          PRK LF INSRNS\n",
+       "5       7.186156           0.993180  __splink__input_table_0  __splink__input_table_1         7419       142779        national presto industries incorporated        national presto industries incorporated                   2     1.774257e+06              wisconsin                   None                          -1                   0.004110                        NaN                 1.000000                        1.000000      NXNL PRST INTSTRS      NXNL PRST INTSTRS\n",
+       "6       6.339909           0.987805  __splink__input_table_0  __splink__input_table_1         7387       142016               national bankshares incorporated               national bankshares incorporated                   2     1.774257e+06               virginia  commonwealth virginia                           0                   0.006276                   0.000022                 0.556230                        1.000000            NXNL BNKXRS            NXNL BNKXRS\n",
+       "7      13.610142           0.999920  __splink__input_table_0  __splink__input_table_1         7387       127697               national bankshares incorporated               national bankshares incorporated                   2     1.774257e+06               virginia               virginia                           2                   0.006276                   0.006276                 2.487467                       34.518756            NXNL BNKXRS            NXNL BNKXRS\n",
+       "8       7.717639           0.995272  __splink__input_table_0  __splink__input_table_1         8258       162906          thermo fisher scientific incorporated          thermo fisher scientific incorporated                   2     1.774257e+06               delaware               delaware                           2                   0.372842                   0.372842                 2.487467                        0.581079         0RM FXR SSNTFK         0RM FXR SSNTFK\n",
+       "9      12.101855           0.999773  __splink__input_table_0  __splink__input_table_1         7428        60197  general motors financial company incorporated  general motors financial company incorporated                   2     1.774257e+06                  texas                  texas                           2                   0.017854                   0.017854                 2.487467                       12.134323        JNRL MTRS FNNXL        JNRL MTRS FNNXL\n",
+       "10      6.339909           0.987805  __splink__input_table_0  __splink__input_table_1         8258       163501          thermo fisher scientific incorporated          thermo fisher scientific incorporated                   2     1.774257e+06               delaware                 mexico                           0                   0.372842                   0.011205                 0.556230                        1.000000         0RM FXR SSNTFK         0RM FXR SSNTFK\n",
+       "11      6.339909           0.987805  __splink__input_table_0  __splink__input_table_1         5498        52885             apollo strategic growth capital ii             apollo strategic growth capital ii                   2     1.774257e+06                     e9         cayman islands                           0                   0.001069                   0.015387                 0.556230                        1.000000   APL STRTJK KR0 KPTL    APL STRTJK KR0 KPTL \n",
+       "12      6.339909           0.987805  __splink__input_table_0  __splink__input_table_1         8258       162892          thermo fisher scientific incorporated          thermo fisher scientific incorporated                   2     1.774257e+06               delaware                 canada                           0                   0.372842                   0.012191                 0.556230                        1.000000         0RM FXR SSNTFK         0RM FXR SSNTFK\n",
+       "13      6.339909           0.987805  __splink__input_table_0  __splink__input_table_1         8258       162847          thermo fisher scientific incorporated          thermo fisher scientific incorporated                   2     1.774257e+06               delaware                 russia                           0                   0.372842                   0.001108                 0.556230                        1.000000         0RM FXR SSNTFK         0RM FXR SSNTFK\n",
+       "14      6.339909           0.987805  __splink__input_table_0  __splink__input_table_1          498        18301                     intellinetics incorporated                     intellinetics incorporated                   2     1.774257e+06                 nevada                   ohio                           0                   0.014652                   0.008136                 0.556230                        1.000000               INTLNTKS               INTLNTKS\n",
+       "15      6.339909           0.987805  __splink__input_table_0  __splink__input_table_1         1533       165897          high sierra technologies incorporated          high sierra technologies incorporated                   2     1.774257e+06               colorado                 nevada                           0                   0.004817                   0.014652                 0.556230                        1.000000            H SR TXNLJS            H SR TXNLJS\n",
+       "16     13.991858           0.999939  __splink__input_table_0  __splink__input_table_1         2127        61213                        lnpr group incorporated                        lnpr group incorporated                   2     1.774257e+06               colorado               colorado                           2                   0.004817                   0.004817                 2.487467                       44.974148               LNPR KRP               LNPR KRP\n",
+       "17      7.186156           0.993180  __splink__input_table_0  __splink__input_table_1           93         1969                  norwood financial corporation                  norwood financial corporation                   2     1.774257e+06           pennsylvania                   None                          -1                   0.007919                        NaN                 1.000000                        1.000000             NRWT FNNXL             NRWT FNNXL\n",
+       "18      6.339909           0.987805  __splink__input_table_0  __splink__input_table_1          151         2257                               nov incorporated                               nov incorporated                   2     1.774257e+06               delaware              mauritius                           0                   0.372842                   0.001075                 0.556230                        1.000000                     NF                     NF\n",
+       "19      6.339909           0.987805  __splink__input_table_0  __splink__input_table_1          280        10975                  juniper networks incorporated                  juniper networks incorporated                   2     1.774257e+06               delaware        california, usa                           0                   0.372842                   0.000234                 0.556230                        1.000000            JNPR NTWRKS            JNPR NTWRKS\n",
+       "20      3.252392           0.905028  __splink__input_table_0  __splink__input_table_1         1399       157790                             logiq incorporated                            logiq3 incorporated                   1     2.087284e+05               delaware                 canada                           0                   0.372842                   0.012191                 0.556230                        1.000000                    LJK                    LJK\n",
+       "21      7.717639           0.995272  __splink__input_table_0  __splink__input_table_1         1720       166283                             edgio incorporated                             edgio incorporated                   2     1.774257e+06               delaware               delaware                           2                   0.372842                   0.372842                 2.487467                        0.581079                     EJ                     EJ\n",
+       "22      6.339909           0.987805  __splink__input_table_0  __splink__input_table_1         2020       184709                       arem pacific corporation                       arem pacific corporation                   2     1.774257e+06               delaware                arizona                           0                   0.372842                   0.004388                 0.556230                        1.000000               ARM PSFK               ARM PSFK\n",
+       "23      7.186156           0.993180  __splink__input_table_0  __splink__input_table_1          756        26596                      ensign group incorporated                      ensign group incorporated                   2     1.774257e+06                   None                 nevada                          -1                        NaN                   0.014652                 1.000000                        1.000000              ENSKN KRP              ENSKN KRP\n",
+       "24      7.186156           0.993180  __splink__input_table_0  __splink__input_table_1         1104        24668         cco holdings limited liability company         cco holdings limited liability company                   2     1.774257e+06                   None               delaware                          -1                        NaN                   0.372842                 1.000000                        1.000000              KK HLTNKS              KK HLTNKS\n",
+       "25      7.717639           0.995272  __splink__input_table_0  __splink__input_table_1          321        11011                     pc connection incorporated                     pc connection incorporated                   2     1.774257e+06               delaware               delaware                           2                   0.372842                   0.372842                 2.487467                        0.581079                  KNKXN                  KNKXN\n",
+       "26      6.339909           0.987805  __splink__input_table_0  __splink__input_table_1          477        14483                        polarityte incorporated                        polarityte incorporated                   2     1.774257e+06               delaware                 nevada                           0                   0.372842                   0.014652                 0.556230                        1.000000                  PLRTT                  PLRTT\n",
+       "27      7.717639           0.995272  __splink__input_table_0  __splink__input_table_1          810        25991      atlas air worldwide holdings incorporated      atlas air worldwide holdings incorporated                   2     1.774257e+06               delaware               delaware                           2                   0.372842                   0.372842                 2.487467                        0.581079  ATLS AR WRLTWT HLTNKS  ATLS AR WRLTWT HLTNKS\n",
+       "28      6.339909           0.987805  __splink__input_table_0  __splink__input_table_1         1003       166010                          spi energy co limited                          spi energy co limited                   2     1.774257e+06                     e9                 cayman                           0                   0.001069                   0.000345                 0.556230                        1.000000                SP ENRJ                SP ENRJ\n",
+       "29      7.717639           0.995272  __splink__input_table_0  __splink__input_table_1         1012       165926        bimi international medical incorporated        bimi international medical incorporated                   2     1.774257e+06               delaware               delaware                           2                   0.372842                   0.372842                 2.487467                        0.581079       BM INTRNXNL MTKL       BM INTRNXNL MTKL\n",
+       "30      7.186156           0.993180  __splink__input_table_0  __splink__input_table_1         1868        51876                          phreesia incorporated                          phreesia incorporated                   2     1.774257e+06               delaware                   None                          -1                   0.372842                        NaN                 1.000000                        1.000000                    FRX                    FRX\n",
+       "31      6.339909           0.987805  __splink__input_table_0  __splink__input_table_1         2198        78290                        secureworks corporation                        secureworks corporation                   2     1.774257e+06               delaware          united states                           0                   0.372842                   0.012146                 0.556230                        1.000000                SKRWRKS                SKRWRKS\n",
+       "32      7.717639           0.995272  __splink__input_table_0  __splink__input_table_1         2273        58771                    ryerson holding corporation                    ryerson holding corporation                   2     1.774257e+06               delaware               delaware                           2                   0.372842                   0.372842                 2.487467                        0.581079            RYRSN HLTNK            RYRSN HLTNK\n",
+       "33      7.186156           0.993180  __splink__input_table_0  __splink__input_table_1          221         9106               comfort systems usa incorporated               comfort systems usa incorporated                   2     1.774257e+06                   None               arkansas                          -1                        NaN                   0.001253                 1.000000                        1.000000         KMFRT SSTMS US         KMFRT SSTMS US\n",
+       "34     14.351809           0.999952  __splink__input_table_0  __splink__input_table_1          478       180383              winnebago industries incorporated              winnebago industries incorporated                   2     1.774257e+06              minnesota              minnesota                           2                   0.003754                   0.003754                 2.487467                       57.719048           WNBK INTSTRS           WNBK INTSTRS\n",
+       "35      6.339909           0.987805  __splink__input_table_0  __splink__input_table_1         1913       166068       renewable energy acquisition corporation       renewable energy acquisition corporation                   2     1.774257e+06                 nevada                     us                           0                   0.014652                   0.000908                 0.556230                        1.000000      RNWBL ENRJ AKKSXN      RNWBL ENRJ AKKSXN\n",
+       "36      7.186156           0.993180  __splink__input_table_0  __splink__input_table_1          257       164606                 riverview bancorp incorporated                 riverview bancorp incorporated                   2     1.774257e+06             washington                   None                          -1                   0.002996                        NaN                 1.000000                        1.000000             RFRF BNKRP             RFRF BNKRP\n",
+       "37      7.186156           0.993180  __splink__input_table_0  __splink__input_table_1          294       182945                timberland bancorp incorporated                timberland bancorp incorporated                   2     1.774257e+06             washington                   None                          -1                   0.002996                        NaN                 1.000000                        1.000000          TMBRLNT BNKRP          TMBRLNT BNKRP\n",
+       "38      7.717639           0.995272  __splink__input_table_0  __splink__input_table_1          415        18543                                lkq corporation                                lkq corporation                   2     1.774257e+06               delaware               delaware                           2                   0.372842                   0.372842                 2.487467                        0.581079                    LKK                    LKK\n",
+       "39      7.717639           0.995272  __splink__input_table_0  __splink__input_table_1          674        23252           berkshire hills bancorp incorporated           berkshire hills bancorp incorporated                   2     1.774257e+06               delaware               delaware                           2                   0.372842                   0.372842                 2.487467                        0.581079        BRKXR HLS BNKRP        BRKXR HLS BNKRP\n",
+       "40      6.339909           0.987805  __splink__input_table_0  __splink__input_table_1         1270       181001                dolby laboratories incorporated                dolby laboratories incorporated                   2     1.774257e+06               delaware             california                           0                   0.372842                   0.015978                 0.556230                        1.000000             TLB LBRTRS             TLB LBRTRS\n",
+       "41      3.252392           0.905028  __splink__input_table_0  __splink__input_table_1         1321       132984                               tss incorporated                               dss incorporated                   1     2.087284e+05               delaware               new york                           0                   0.372842                   0.009913                 0.556230                        1.000000                     TS                     TS\n",
+       "42      7.717639           0.995272  __splink__input_table_0  __splink__input_table_1         1482        46045              anywhere real estate incorporated              anywhere real estate incorporated                   2     1.774257e+06               delaware               delaware                           2                   0.372842                   0.372842                 2.487467                        0.581079           ANHR RL ESTT           ANHR RL ESTT\n",
+       "43      6.339909           0.987805  __splink__input_table_0  __splink__input_table_1         1494        47625                               kbr incorporated                               kbr incorporated                   2     1.774257e+06               delaware          united states                           0                   0.372842                   0.012146                 0.556230                        1.000000                    KBR                    KBR\n",
+       "44      7.717639           0.995272  __splink__input_table_0  __splink__input_table_1         1972       166348              reshape lifesciences incorporated              reshape lifesciences incorporated                   2     1.774257e+06               delaware               delaware                           2                   0.372842                   0.372842                 2.487467                        0.581079            RXP LFSSNSS            RXP LFSSNSS\n",
+       "45     12.387018           0.999813  __splink__input_table_0  __splink__input_table_1         1457       172081                  imperalis holding corporation                  imperalis holding corporation                   2     1.774257e+06                 nevada                 nevada                           2                   0.014652                   0.014652                 2.487467                       14.786255           IMPRLS HLTNK           IMPRLS HLTNK\n",
+       "46     12.387018           0.999813  __splink__input_table_0  __splink__input_table_1         2037       172091                 bitnile metaverse incorporated                 bitnile metaverse incorporated                   2     1.774257e+06                 nevada                 nevada                           2                   0.014652                   0.014652                 2.487467                       14.786255             BTNL MTFRS             BTNL MTFRS\n",
+       "47      7.717639           0.995272  __splink__input_table_0  __splink__input_table_1         1058        35808                               qvc incorporated                               qvc incorporated                   2     1.774257e+06               delaware               delaware                           2                   0.372842                   0.372842                 2.487467                        0.581079                    KFK                    KFK\n",
+       "48      9.692877           0.998793  __splink__input_table_0  __splink__input_table_1         1705        47703              irhythm technologies incorporated              irhythm technologies incorporated                   2     1.774257e+06               delaware            us delaware                           1                   0.372842                   0.000323                 5.683268                        1.000000           IRH0M TXNLJS           IRH0M TXNLJS\n",
+       "49      7.186156           0.993180  __splink__input_table_0  __splink__input_table_1          338        13985              essex property trust incorporated              essex property trust incorporated                   2     1.774257e+06               maryland                   None                          -1                   0.007786                        NaN                 1.000000                        1.000000        ESKS PRPRT TRST        ESKS PRPRT TRST"
       ]
      },
-     "execution_count": 238,
+     "execution_count": 109,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "preds_df.columns"
+    "preds_df[preds_df.match_probability > .9]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 249,
+   "execution_count": 79,
    "id": "8e658c36-7b6f-480f-9d74-37af9510ebe2",
    "metadata": {
     "tags": []
@@ -2908,579 +5071,587 @@
        "      <th>match_probability</th>\n",
        "      <th>company_name_l</th>\n",
        "      <th>company_name_r</th>\n",
-       "      <th>loc_of_incorporation_l</th>\n",
-       "      <th>loc_of_incorporation_r</th>\n",
+       "      <th>loc_list_l</th>\n",
+       "      <th>loc_list_r</th>\n",
        "      <th>company_name_mphone_l</th>\n",
        "      <th>company_name_mphone_r</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>150</th>\n",
-       "      <td>0.996128</td>\n",
-       "      <td>santander drive auto receivables trust 2018-1</td>\n",
-       "      <td>santander drive auto receivables trust</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>SNTNTR TRF AT RSFBLS TRST</td>\n",
-       "      <td>SNTNTR TRF AT RSFBLS TRST</td>\n",
+       "      <th>465</th>\n",
+       "      <td>0.914612</td>\n",
+       "      <td>conns incorporated</td>\n",
+       "      <td>invenco incorporated</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>KNS</td>\n",
+       "      <td>INFNK</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>151</th>\n",
-       "      <td>0.996128</td>\n",
-       "      <td>santander drive auto receivables trust 2018-5</td>\n",
-       "      <td>santander drive auto receivables trust</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>SNTNTR TRF AT RSFBLS TRST</td>\n",
-       "      <td>SNTNTR TRF AT RSFBLS TRST</td>\n",
+       "      <th>466</th>\n",
+       "      <td>0.914612</td>\n",
+       "      <td>vishay intertechnology incorporated</td>\n",
+       "      <td>vishay precision foil, incorporated</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>FX INTRTXNLJ</td>\n",
+       "      <td>FX PRSXN FL</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>152</th>\n",
-       "      <td>0.996128</td>\n",
-       "      <td>santander drive auto receivables trust 2018-3</td>\n",
-       "      <td>santander drive auto receivables trust</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>SNTNTR TRF AT RSFBLS TRST</td>\n",
-       "      <td>SNTNTR TRF AT RSFBLS TRST</td>\n",
+       "      <th>467</th>\n",
+       "      <td>0.980607</td>\n",
+       "      <td>vishay precision group, incorporated</td>\n",
+       "      <td>vishay precision foil, incorporated</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>FX PRSXN KRP</td>\n",
+       "      <td>FX PRSXN FL</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>153</th>\n",
-       "      <td>0.996128</td>\n",
-       "      <td>santander drive auto receivables trust 2016-1</td>\n",
-       "      <td>santander drive auto receivables trust</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>SNTNTR TRF AT RSFBLS TRST</td>\n",
-       "      <td>SNTNTR TRF AT RSFBLS TRST</td>\n",
+       "      <th>470</th>\n",
+       "      <td>0.975104</td>\n",
+       "      <td>jones lang lasalle incorporated</td>\n",
+       "      <td>jones lang lasalle limited</td>\n",
+       "      <td>[maryland]</td>\n",
+       "      <td>[hong, kong]</td>\n",
+       "      <td>JNS LNK LSL</td>\n",
+       "      <td>JNS LNK LSL</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>154</th>\n",
-       "      <td>0.573277</td>\n",
-       "      <td>constellation pharmaceuticals inc</td>\n",
-       "      <td>constellation connect, llc</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>KNSTLXN FRMSTKLS INK</td>\n",
-       "      <td>KNSTLXN KNKT LK</td>\n",
+       "      <th>471</th>\n",
+       "      <td>0.951657</td>\n",
+       "      <td>nrg energy, incorporated</td>\n",
+       "      <td>nrg energy, incorporated</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>NRK ENRJ</td>\n",
+       "      <td>NRK ENRJ</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>162</th>\n",
-       "      <td>0.959568</td>\n",
-       "      <td>consolidated communications holdings, inc.</td>\n",
-       "      <td>consolidated communications of</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>illinois</td>\n",
-       "      <td>KNSLTTT KMNKXNS HLTNKS INK</td>\n",
-       "      <td>KNSLTTT KMNKXNS OF</td>\n",
+       "      <th>472</th>\n",
+       "      <td>0.914612</td>\n",
+       "      <td>firstenergy corporation</td>\n",
+       "      <td>firstenergy ventures corporation</td>\n",
+       "      <td>[ohio]</td>\n",
+       "      <td>[ohio]</td>\n",
+       "      <td>FRSTNRJ</td>\n",
+       "      <td>FRSTNRJ FNTRS</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>163</th>\n",
-       "      <td>0.959568</td>\n",
-       "      <td>consolidated communications holdings, inc.</td>\n",
-       "      <td>consolidated communications of</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>missouri</td>\n",
-       "      <td>KNSLTTT KMNKXNS HLTNKS INK</td>\n",
-       "      <td>KNSLTTT KMNKXNS OF</td>\n",
+       "      <th>478</th>\n",
+       "      <td>0.914612</td>\n",
+       "      <td>hudson pacific properties, incorporated</td>\n",
+       "      <td>hudson pacific services, incorporated</td>\n",
+       "      <td>[maryland]</td>\n",
+       "      <td>[maryland]</td>\n",
+       "      <td>HTSN PSFK PRPRTS</td>\n",
+       "      <td>HTSN PSFK SRFSS</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>164</th>\n",
-       "      <td>0.959568</td>\n",
-       "      <td>consolidated communications holdings, inc.</td>\n",
-       "      <td>consolidated communications of</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>maine</td>\n",
-       "      <td>KNSLTTT KMNKXNS HLTNKS INK</td>\n",
-       "      <td>KNSLTTT KMNKXNS OF</td>\n",
+       "      <th>479</th>\n",
+       "      <td>0.980607</td>\n",
+       "      <td>hudson pacific properties, incorporated</td>\n",
+       "      <td>hudson pacific properties, limited partnership</td>\n",
+       "      <td>[maryland]</td>\n",
+       "      <td>[maryland]</td>\n",
+       "      <td>HTSN PSFK PRPRTS</td>\n",
+       "      <td>HTSN PSFK PRPRTS</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>165</th>\n",
-       "      <td>0.959568</td>\n",
-       "      <td>consolidated communications holdings, inc.</td>\n",
-       "      <td>consolidated communications of</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>kansas</td>\n",
-       "      <td>KNSLTTT KMNKXNS HLTNKS INK</td>\n",
-       "      <td>KNSLTTT KMNKXNS OF</td>\n",
+       "      <th>481</th>\n",
+       "      <td>0.914612</td>\n",
+       "      <td>digital ally, incorporated</td>\n",
+       "      <td>digital ally international, incorporated</td>\n",
+       "      <td>[nevada]</td>\n",
+       "      <td>[nevada]</td>\n",
+       "      <td>TJTL AL</td>\n",
+       "      <td>TJTL AL INTRNXNL</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>166</th>\n",
-       "      <td>0.959568</td>\n",
-       "      <td>consolidated communications holdings, inc.</td>\n",
-       "      <td>consolidated communications of</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>minnesota</td>\n",
-       "      <td>KNSLTTT KMNKXNS HLTNKS INK</td>\n",
-       "      <td>KNSLTTT KMNKXNS OF</td>\n",
+       "      <th>489</th>\n",
+       "      <td>0.976947</td>\n",
+       "      <td>cco holdings limited liability company</td>\n",
+       "      <td>rhfw holdings, limited liability company</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>KK HLTNKS</td>\n",
+       "      <td>RHF HLTNKS</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>167</th>\n",
-       "      <td>0.959568</td>\n",
-       "      <td>consolidated communications holdings, inc.</td>\n",
-       "      <td>consolidated communications of</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>central</td>\n",
-       "      <td>KNSLTTT KMNKXNS HLTNKS INK</td>\n",
-       "      <td>KNSLTTT KMNKXNS OF</td>\n",
+       "      <th>493</th>\n",
+       "      <td>0.975104</td>\n",
+       "      <td>intuitive surgical incorporated</td>\n",
+       "      <td>intuitive surgical limited</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>[united, kingdom]</td>\n",
+       "      <td>INTTF SRJKL</td>\n",
+       "      <td>INTTF SRJKL</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>168</th>\n",
-       "      <td>0.959568</td>\n",
-       "      <td>consolidated communications holdings, inc.</td>\n",
-       "      <td>consolidated communications of</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>florida</td>\n",
-       "      <td>KNSLTTT KMNKXNS HLTNKS INK</td>\n",
-       "      <td>KNSLTTT KMNKXNS OF</td>\n",
+       "      <th>494</th>\n",
+       "      <td>0.975104</td>\n",
+       "      <td>jones lang lasalle incorporated</td>\n",
+       "      <td>jones lang lasalle limited</td>\n",
+       "      <td>[maryland]</td>\n",
+       "      <td>[england]</td>\n",
+       "      <td>JNS LNK LSL</td>\n",
+       "      <td>JNS LNK LSL</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>169</th>\n",
-       "      <td>0.959568</td>\n",
-       "      <td>consolidated communications holdings, inc.</td>\n",
-       "      <td>consolidated communications of</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>california</td>\n",
-       "      <td>KNSLTTT KMNKXNS HLTNKS INK</td>\n",
-       "      <td>KNSLTTT KMNKXNS OF</td>\n",
+       "      <th>500</th>\n",
+       "      <td>0.975104</td>\n",
+       "      <td>becton dickinson and company</td>\n",
+       "      <td>becton, dickinson and company, limited</td>\n",
+       "      <td>[new, jersey]</td>\n",
+       "      <td>[ireland]</td>\n",
+       "      <td>BKTN TKNSN ANT</td>\n",
+       "      <td>BKTN TKNSN ANT</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>174</th>\n",
-       "      <td>0.573277</td>\n",
-       "      <td>duke energy corp</td>\n",
-       "      <td>duke energy one, inc</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>TK ENRJ KRP</td>\n",
-       "      <td>TK ENRJ ON INK</td>\n",
+       "      <th>501</th>\n",
+       "      <td>0.975104</td>\n",
+       "      <td>united parcel service incorporated</td>\n",
+       "      <td>united guaranty services, incorporated</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>[north, carolina]</td>\n",
+       "      <td>UNTT PRSL SRFS</td>\n",
+       "      <td>UNTT KRNT SRFSS</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>177</th>\n",
-       "      <td>0.573277</td>\n",
-       "      <td>verus international, inc.</td>\n",
-       "      <td>emcor international, inc</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>FRS INTRNXNL INK</td>\n",
-       "      <td>EMKR INTRNXNL INK</td>\n",
+       "      <th>509</th>\n",
+       "      <td>0.914612</td>\n",
+       "      <td>estee lauder companies incorporated</td>\n",
+       "      <td>estee lauder international, incorporated</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>EST LTR KMPNS</td>\n",
+       "      <td>EST LTR INTRNXNL</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>178</th>\n",
-       "      <td>0.573277</td>\n",
-       "      <td>verus international, inc.</td>\n",
-       "      <td>emcor international, inc</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>FRS INTRNXNL INK</td>\n",
-       "      <td>EMKR INTRNXNL INK</td>\n",
+       "      <th>510</th>\n",
+       "      <td>0.914612</td>\n",
+       "      <td>maxcyte, incorporated</td>\n",
+       "      <td>cues, incorporated</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>MKSST</td>\n",
+       "      <td>KS</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>179</th>\n",
-       "      <td>0.714594</td>\n",
-       "      <td>green plains inc.</td>\n",
-       "      <td>green plains superior llc fka superior</td>\n",
-       "      <td>iowa</td>\n",
-       "      <td>iowa</td>\n",
-       "      <td>KRN PLNS INK</td>\n",
-       "      <td>KRN PLNS SPRR LK FK SPRR</td>\n",
+       "      <th>515</th>\n",
+       "      <td>0.980607</td>\n",
+       "      <td>zimmer biomet holdings, incorporated</td>\n",
+       "      <td>zimmer biomet spine, incorporated</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>SMR BMT HLTNKS</td>\n",
+       "      <td>SMR BMT SPN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>183</th>\n",
-       "      <td>0.996128</td>\n",
-       "      <td>duke energy corp</td>\n",
-       "      <td>duke energy group, llc</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>TK ENRJ KRP</td>\n",
-       "      <td>TK ENRJ KRP LK</td>\n",
+       "      <th>518</th>\n",
+       "      <td>0.914612</td>\n",
+       "      <td>nordicus partners corporation</td>\n",
+       "      <td>nordco enterprises, incorporated</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>[wilmington, delaware]</td>\n",
+       "      <td>NRTKS PRTNRS</td>\n",
+       "      <td>NRTK ENTRPRSS</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>195</th>\n",
-       "      <td>0.884993</td>\n",
-       "      <td>green stream holdings inc.</td>\n",
-       "      <td>western gas wyoming, l.l.c</td>\n",
-       "      <td>wyoming</td>\n",
-       "      <td>wyoming</td>\n",
-       "      <td>KRN STRM HLTNKS INK</td>\n",
-       "      <td>WSTRN KS YMNK LLK</td>\n",
+       "      <th>519</th>\n",
+       "      <td>0.975104</td>\n",
+       "      <td>valero energy corp/tx</td>\n",
+       "      <td>valero energy incorporated</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>[canada]</td>\n",
+       "      <td>FLR ENRJ TKS</td>\n",
+       "      <td>FLR ENRJ</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>196</th>\n",
-       "      <td>0.884993</td>\n",
-       "      <td>green stream holdings inc.</td>\n",
-       "      <td>western gas wyoming, l.l.c</td>\n",
-       "      <td>wyoming</td>\n",
-       "      <td>wyoming</td>\n",
-       "      <td>KRN STRM HLTNKS INK</td>\n",
-       "      <td>WSTRN KS YMNK LLK</td>\n",
+       "      <th>527</th>\n",
+       "      <td>0.914612</td>\n",
+       "      <td>nrg energy, incorporated</td>\n",
+       "      <td>nrg energy holdings incorporated</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>NRK ENRJ</td>\n",
+       "      <td>NRK ENRJ HLTNKS</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>197</th>\n",
-       "      <td>0.992184</td>\n",
-       "      <td>fortress biotech, inc.</td>\n",
-       "      <td>fortress biotech, china, inc</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>None</td>\n",
-       "      <td>FRTRS BTX INK</td>\n",
-       "      <td>FRTRS BTX XN INK</td>\n",
+       "      <th>528</th>\n",
+       "      <td>0.914612</td>\n",
+       "      <td>everi holdings incorporated</td>\n",
+       "      <td>edi holdings, incorporated</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>EFR HLTNKS</td>\n",
+       "      <td>ET HLTNKS</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>199</th>\n",
-       "      <td>0.996128</td>\n",
-       "      <td>duke energy corp</td>\n",
-       "      <td>duke energy china corp</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>TK ENRJ KRP</td>\n",
-       "      <td>TK ENRJ XN KRP</td>\n",
+       "      <th>535</th>\n",
+       "      <td>0.914612</td>\n",
+       "      <td>estee lauder companies incorporated</td>\n",
+       "      <td>estee lauder incorporated</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>EST LTR KMPNS</td>\n",
+       "      <td>EST LTR</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>200</th>\n",
-       "      <td>0.573277</td>\n",
-       "      <td>duke energy corp</td>\n",
-       "      <td>duke energy corporate services, inc</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>TK ENRJ KRP</td>\n",
-       "      <td>TK ENRJ KRPRT SRFSS INK</td>\n",
+       "      <th>548</th>\n",
+       "      <td>0.975104</td>\n",
+       "      <td>universal logistics holdings, incorporated</td>\n",
+       "      <td>universal logistics corporation</td>\n",
+       "      <td>[michigan]</td>\n",
+       "      <td>[florida]</td>\n",
+       "      <td>UNFRSL LJSTKS HLTNKS</td>\n",
+       "      <td>UNFRSL LJSTKS</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>203</th>\n",
-       "      <td>0.573277</td>\n",
-       "      <td>apollo global management, inc.</td>\n",
-       "      <td>apollo belenos management llc</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>APL KLBL MNJMNT INK</td>\n",
-       "      <td>APL BLNS MNJMNT LK</td>\n",
+       "      <th>551</th>\n",
+       "      <td>0.975104</td>\n",
+       "      <td>alliant energy corporation</td>\n",
+       "      <td>allergan gi corporation</td>\n",
+       "      <td>[wisconsin]</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>ALNT ENRJ</td>\n",
+       "      <td>ALRKN J</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>204</th>\n",
-       "      <td>0.573277</td>\n",
-       "      <td>apollo global management, inc.</td>\n",
-       "      <td>apollo belenos management llc</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>APL KLBL MNJMNT INK</td>\n",
-       "      <td>APL BLNS MNJMNT LK</td>\n",
+       "      <th>555</th>\n",
+       "      <td>0.975104</td>\n",
+       "      <td>smartmetric, incorporated</td>\n",
+       "      <td>smartpetro incorporated</td>\n",
+       "      <td>[nevada]</td>\n",
+       "      <td>[philippines]</td>\n",
+       "      <td>SMRTMTRK</td>\n",
+       "      <td>SMRTPTR</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>206</th>\n",
-       "      <td>0.981099</td>\n",
-       "      <td>columbia property trust, inc.</td>\n",
-       "      <td>columbia courtyard, inc</td>\n",
-       "      <td>maryland</td>\n",
-       "      <td>maryland</td>\n",
-       "      <td>KLMB PRPRT TRST INK</td>\n",
-       "      <td>KLMB KRTYRT INK</td>\n",
+       "      <th>566</th>\n",
+       "      <td>0.914612</td>\n",
+       "      <td>republic services, incorporated</td>\n",
+       "      <td>republic conduit, incorporated</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>RPBLK SRFSS</td>\n",
+       "      <td>RPBLK KNTT</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>208</th>\n",
-       "      <td>0.573277</td>\n",
-       "      <td>duke energy corp</td>\n",
-       "      <td>duke energy beckjord, llc</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>TK ENRJ KRP</td>\n",
-       "      <td>TK ENRJ BKJRT LK</td>\n",
+       "      <th>571</th>\n",
+       "      <td>0.975104</td>\n",
+       "      <td>freedom holdings, incorporated</td>\n",
+       "      <td>freedom designs, incorporated</td>\n",
+       "      <td>[maryland]</td>\n",
+       "      <td>[california]</td>\n",
+       "      <td>FRTM HLTNKS</td>\n",
+       "      <td>FRTM TSKNS</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>209</th>\n",
-       "      <td>0.573277</td>\n",
-       "      <td>duke energy corp</td>\n",
-       "      <td>duke energy beckjord storage llc</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>TK ENRJ KRP</td>\n",
-       "      <td>TK ENRJ BKJRT STRJ LK</td>\n",
+       "      <th>573</th>\n",
+       "      <td>0.938457</td>\n",
+       "      <td>ares real estate income trust incorporated</td>\n",
+       "      <td>ares real estate income trust incorporated</td>\n",
+       "      <td>[maryland]</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>ARS RL ESTT INKM TRST</td>\n",
+       "      <td>ARS RL ESTT INKM TRST</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>210</th>\n",
-       "      <td>0.573277</td>\n",
-       "      <td>duke energy corp</td>\n",
-       "      <td>duke energy acp, llc</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>TK ENRJ KRP</td>\n",
-       "      <td>TK ENRJ AKP LK</td>\n",
+       "      <th>574</th>\n",
+       "      <td>0.975104</td>\n",
+       "      <td>bank of new york mellon corporation</td>\n",
+       "      <td>bank of new york mellon sa/nv</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>[belgium]</td>\n",
+       "      <td>BNK OF N YRK MLN</td>\n",
+       "      <td>BNK OF N YRK MLN SNF</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>213</th>\n",
-       "      <td>0.981099</td>\n",
-       "      <td>spirit realty capital, inc.</td>\n",
-       "      <td>spirit reit, inc</td>\n",
-       "      <td>maryland</td>\n",
-       "      <td>maryland</td>\n",
-       "      <td>SPRT RLT KPTL INK</td>\n",
-       "      <td>SPRT RT INK</td>\n",
+       "      <th>576</th>\n",
+       "      <td>0.914612</td>\n",
+       "      <td>southern company</td>\n",
+       "      <td>southern wood piedmont company</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>S0RN</td>\n",
+       "      <td>S0RN WT PTMNT</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>215</th>\n",
-       "      <td>0.573277</td>\n",
-       "      <td>apollo global management, inc.</td>\n",
-       "      <td>apollo na management ii, llc</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>APL KLBL MNJMNT INK</td>\n",
-       "      <td>APL N MNJMNT LK</td>\n",
+       "      <th>582</th>\n",
+       "      <td>0.914612</td>\n",
+       "      <td>ameresco, incorporated</td>\n",
+       "      <td>ameripath, incorporated</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>AMRSK</td>\n",
+       "      <td>AMRP0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>216</th>\n",
-       "      <td>0.573277</td>\n",
-       "      <td>apollo global management, inc.</td>\n",
-       "      <td>apollo na management ii, llc</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>APL KLBL MNJMNT INK</td>\n",
-       "      <td>APL N MNJMNT LK</td>\n",
+       "      <th>584</th>\n",
+       "      <td>0.914612</td>\n",
+       "      <td>trevena incorporated</td>\n",
+       "      <td>anr, incorporated</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>TRFN</td>\n",
+       "      <td>ANR</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>225</th>\n",
-       "      <td>0.992184</td>\n",
-       "      <td>fortress biotech, inc.</td>\n",
-       "      <td>fortress biotech, china, inc</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>None</td>\n",
-       "      <td>FRTRS BTX INK</td>\n",
-       "      <td>FRTRS BTX XN INK</td>\n",
+       "      <th>590</th>\n",
+       "      <td>0.975104</td>\n",
+       "      <td>bank of new york mellon corporation</td>\n",
+       "      <td>bank of new york mellon</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>[new, york]</td>\n",
+       "      <td>BNK OF N YRK MLN</td>\n",
+       "      <td>BNK OF N YRK MLN</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>226</th>\n",
-       "      <td>0.573277</td>\n",
-       "      <td>green brick partners, inc.</td>\n",
-       "      <td>green brick mortgage, llc</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>KRN BRK PRTNRS INK</td>\n",
-       "      <td>KRN BRK MRTKJ LK</td>\n",
+       "      <th>591</th>\n",
+       "      <td>0.938457</td>\n",
+       "      <td>xerox holdings corporation</td>\n",
+       "      <td>xerox holdings corporation</td>\n",
+       "      <td>[connecticut]</td>\n",
+       "      <td>[new, york]</td>\n",
+       "      <td>SRKS HLTNKS</td>\n",
+       "      <td>SRKS HLTNKS</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>227</th>\n",
-       "      <td>0.573277</td>\n",
-       "      <td>duke energy corp</td>\n",
-       "      <td>duke energy beckjord storage llc</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>TK ENRJ KRP</td>\n",
-       "      <td>TK ENRJ BKJRT STRJ LK</td>\n",
+       "      <th>594</th>\n",
+       "      <td>0.975104</td>\n",
+       "      <td>jones lang lasalle incorporated</td>\n",
+       "      <td>jones lang lasalle ip, incorporated</td>\n",
+       "      <td>[maryland]</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>JNS LNK LSL</td>\n",
+       "      <td>JNS LNK LSL IP</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>228</th>\n",
-       "      <td>0.959568</td>\n",
-       "      <td>green plains inc.</td>\n",
-       "      <td>green plains madison llc</td>\n",
-       "      <td>iowa</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>KRN PLNS INK</td>\n",
-       "      <td>KRN PLNS MTSN LK</td>\n",
+       "      <th>595</th>\n",
+       "      <td>0.914612</td>\n",
+       "      <td>iron mountain incorporated</td>\n",
+       "      <td>iron mountain global holdings, incorporated</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>IRN MNTN</td>\n",
+       "      <td>IRN MNTN KLBL HLTNKS</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>242</th>\n",
-       "      <td>0.959568</td>\n",
-       "      <td>great lakes dredge &amp; dock corp</td>\n",
-       "      <td>great lakes dredge &amp; dock do brasil ltda</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>brazil</td>\n",
-       "      <td>KRT LKS TRJ TK KRP</td>\n",
-       "      <td>KRT LKS TRJ TK T BRSL LTT</td>\n",
+       "      <th>597</th>\n",
+       "      <td>0.980607</td>\n",
+       "      <td>extreme networks incorporated</td>\n",
+       "      <td>extreme networks ihc, incorporated</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>EKSTRM NTWRKS</td>\n",
+       "      <td>EKSTRM NTWRKS IK</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>243</th>\n",
-       "      <td>0.573277</td>\n",
-       "      <td>great lakes dredge &amp; dock corp</td>\n",
-       "      <td>great lakes dredge &amp; dock environmental, inc</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>KRT LKS TRJ TK KRP</td>\n",
-       "      <td>KRT LKS TRJ TK ENFRNMNTL INK</td>\n",
+       "      <th>599</th>\n",
+       "      <td>0.976947</td>\n",
+       "      <td>q2 holdings, incorporated</td>\n",
+       "      <td>vr holdings, incorporated</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>[colorado]</td>\n",
+       "      <td>K HLTNKS</td>\n",
+       "      <td>FR HLTNKS</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>244</th>\n",
-       "      <td>0.996128</td>\n",
-       "      <td>great lakes dredge &amp; dock corp</td>\n",
-       "      <td>great lakes dredge &amp; dock company, llc</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>KRT LKS TRJ TK KRP</td>\n",
-       "      <td>KRT LKS TRJ TK KMPN LK</td>\n",
+       "      <th>600</th>\n",
+       "      <td>0.980607</td>\n",
+       "      <td>extreme networks incorporated</td>\n",
+       "      <td>extreme networks, incorporated</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>EKSTRM NTWRKS</td>\n",
+       "      <td>EKSTRM NTWRKS</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>251</th>\n",
-       "      <td>0.573277</td>\n",
-       "      <td>blackstone group inc</td>\n",
-       "      <td>blackstone pb ii l.l.c</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>BLKSTN KRP INK</td>\n",
-       "      <td>BLKSTN PB LLK</td>\n",
+       "      <th>604</th>\n",
+       "      <td>0.914612</td>\n",
+       "      <td>cutera incorporated</td>\n",
+       "      <td>vrec, incorporated</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>KTR</td>\n",
+       "      <td>FRK</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>252</th>\n",
-       "      <td>0.573277</td>\n",
-       "      <td>blackstone group inc</td>\n",
-       "      <td>blackstone pb i l.l.c</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>BLKSTN KRP INK</td>\n",
-       "      <td>BLKSTN PB I LLK</td>\n",
+       "      <th>605</th>\n",
+       "      <td>0.975104</td>\n",
+       "      <td>assured guaranty limited</td>\n",
+       "      <td>assured guaranty services limited</td>\n",
+       "      <td>[d0]</td>\n",
+       "      <td>[england]</td>\n",
+       "      <td>ASRT KRNT</td>\n",
+       "      <td>ASRT KRNT SRFSS</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>254</th>\n",
-       "      <td>0.573277</td>\n",
-       "      <td>duke energy corp</td>\n",
-       "      <td>duke energy acp, llc</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>TK ENRJ KRP</td>\n",
-       "      <td>TK ENRJ AKP LK</td>\n",
+       "      <th>606</th>\n",
+       "      <td>0.976947</td>\n",
+       "      <td>virtra, incorporated</td>\n",
+       "      <td>viator, incorporated</td>\n",
+       "      <td>[nevada]</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>FRTR</td>\n",
+       "      <td>FTR</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>255</th>\n",
-       "      <td>0.573277</td>\n",
-       "      <td>duke energy corp</td>\n",
-       "      <td>duke energy shoreham, llc</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>TK ENRJ KRP</td>\n",
-       "      <td>TK ENRJ XRHM LK</td>\n",
+       "      <th>618</th>\n",
+       "      <td>0.975104</td>\n",
+       "      <td>sculptor capital management, incorporated</td>\n",
+       "      <td>sculptor capital management hong kong limited</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>[hong, kong]</td>\n",
+       "      <td>SKLPTR KPTL MNJMNT</td>\n",
+       "      <td>SKLPTR KPTL MNJMNT HNK KNK</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>256</th>\n",
-       "      <td>0.573277</td>\n",
-       "      <td>duke energy corp</td>\n",
-       "      <td>duke energy sam, llc</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>TK ENRJ KRP</td>\n",
-       "      <td>TK ENRJ SM LK</td>\n",
+       "      <th>625</th>\n",
+       "      <td>0.975104</td>\n",
+       "      <td>enstar group limited</td>\n",
+       "      <td>enstar limited</td>\n",
+       "      <td>[d0]</td>\n",
+       "      <td>[bermuda]</td>\n",
+       "      <td>ENSTR KRP</td>\n",
+       "      <td>ENSTR</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>257</th>\n",
-       "      <td>0.573277</td>\n",
-       "      <td>blackstone group inc</td>\n",
-       "      <td>blackstone obs l.l.c</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>BLKSTN KRP INK</td>\n",
-       "      <td>BLKSTN OBS LLK</td>\n",
+       "      <th>626</th>\n",
+       "      <td>0.975104</td>\n",
+       "      <td>sellas life sciences group, incorporated</td>\n",
+       "      <td>sellas life sciences group limited</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>[bermuda]</td>\n",
+       "      <td>SLS LF SSNSS KRP</td>\n",
+       "      <td>SLS LF SSNSS KRP</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>264</th>\n",
-       "      <td>0.992184</td>\n",
-       "      <td>freightcar america, inc.</td>\n",
-       "      <td>freightcar america leasing, llc</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>None</td>\n",
-       "      <td>FRTKR AMRK INK</td>\n",
-       "      <td>FRTKR AMRK LSNK LK</td>\n",
+       "      <th>627</th>\n",
+       "      <td>0.975104</td>\n",
+       "      <td>intuitive surgical incorporated</td>\n",
+       "      <td>intuitive surgical canada incorporated</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>[canada]</td>\n",
+       "      <td>INTTF SRJKL</td>\n",
+       "      <td>INTTF SRJKL KNT</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>265</th>\n",
-       "      <td>0.992184</td>\n",
-       "      <td>freightcar america, inc.</td>\n",
-       "      <td>freightcar america leasing, llc</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>None</td>\n",
-       "      <td>FRTKR AMRK INK</td>\n",
-       "      <td>FRTKR AMRK LSNK LK</td>\n",
+       "      <th>630</th>\n",
+       "      <td>0.951657</td>\n",
+       "      <td>forestar group incorporated</td>\n",
+       "      <td>forestar group incorporated</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>FRSTR KRP</td>\n",
+       "      <td>FRSTR KRP</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>266</th>\n",
-       "      <td>0.959568</td>\n",
-       "      <td>qurate retail, inc.</td>\n",
-       "      <td>qurate retail group, inc</td>\n",
-       "      <td>englewood</td>\n",
-       "      <td>de</td>\n",
-       "      <td>KRT RTL INK</td>\n",
-       "      <td>KRT RTL KRP INK</td>\n",
+       "      <th>637</th>\n",
+       "      <td>0.914612</td>\n",
+       "      <td>dcp midstream, limited partnership</td>\n",
+       "      <td>dcp midstream operating, limited partnership</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>TKP MTSTRM</td>\n",
+       "      <td>TKP MTSTRM OPRTNK</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>267</th>\n",
-       "      <td>0.884993</td>\n",
-       "      <td>green stream holdings inc.</td>\n",
-       "      <td>western gas wyoming, l.l.c</td>\n",
-       "      <td>wyoming</td>\n",
-       "      <td>wyoming</td>\n",
-       "      <td>KRN STRM HLTNKS INK</td>\n",
-       "      <td>WSTRN KS YMNK LLK</td>\n",
+       "      <th>639</th>\n",
+       "      <td>0.951657</td>\n",
+       "      <td>equitable holdings, incorporated</td>\n",
+       "      <td>equitable holdings, incorporated</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>EKTBL HLTNKS</td>\n",
+       "      <td>EKTBL HLTNKS</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>268</th>\n",
-       "      <td>0.884993</td>\n",
-       "      <td>green stream holdings inc.</td>\n",
-       "      <td>western gas wyoming, l.l.c</td>\n",
-       "      <td>wyoming</td>\n",
-       "      <td>wyoming</td>\n",
-       "      <td>KRN STRM HLTNKS INK</td>\n",
-       "      <td>WSTRN KS YMNK LLK</td>\n",
+       "      <th>643</th>\n",
+       "      <td>0.914612</td>\n",
+       "      <td>energy transfer limited partnership</td>\n",
+       "      <td>energy transfer partners, limited liability co...</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>[delaware]</td>\n",
+       "      <td>ENRJ TRNSFR</td>\n",
+       "      <td>ENRJ TRNSFR PRTNRS</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "     match_probability                                 company_name_l                                company_name_r loc_of_incorporation_l loc_of_incorporation_r       company_name_mphone_l         company_name_mphone_r\n",
-       "150           0.996128  santander drive auto receivables trust 2018-1        santander drive auto receivables trust               delaware               delaware  SNTNTR TRF AT RSFBLS TRST      SNTNTR TRF AT RSFBLS TRST\n",
-       "151           0.996128  santander drive auto receivables trust 2018-5        santander drive auto receivables trust               delaware               delaware  SNTNTR TRF AT RSFBLS TRST      SNTNTR TRF AT RSFBLS TRST\n",
-       "152           0.996128  santander drive auto receivables trust 2018-3        santander drive auto receivables trust               delaware               delaware  SNTNTR TRF AT RSFBLS TRST      SNTNTR TRF AT RSFBLS TRST\n",
-       "153           0.996128  santander drive auto receivables trust 2016-1        santander drive auto receivables trust               delaware               delaware  SNTNTR TRF AT RSFBLS TRST      SNTNTR TRF AT RSFBLS TRST\n",
-       "154           0.573277              constellation pharmaceuticals inc                    constellation connect, llc               delaware               delaware        KNSTLXN FRMSTKLS INK               KNSTLXN KNKT LK\n",
-       "162           0.959568     consolidated communications holdings, inc.                consolidated communications of               delaware               illinois  KNSLTTT KMNKXNS HLTNKS INK            KNSLTTT KMNKXNS OF\n",
-       "163           0.959568     consolidated communications holdings, inc.                consolidated communications of               delaware               missouri  KNSLTTT KMNKXNS HLTNKS INK            KNSLTTT KMNKXNS OF\n",
-       "164           0.959568     consolidated communications holdings, inc.                consolidated communications of               delaware                  maine  KNSLTTT KMNKXNS HLTNKS INK            KNSLTTT KMNKXNS OF\n",
-       "165           0.959568     consolidated communications holdings, inc.                consolidated communications of               delaware                 kansas  KNSLTTT KMNKXNS HLTNKS INK            KNSLTTT KMNKXNS OF\n",
-       "166           0.959568     consolidated communications holdings, inc.                consolidated communications of               delaware              minnesota  KNSLTTT KMNKXNS HLTNKS INK            KNSLTTT KMNKXNS OF\n",
-       "167           0.959568     consolidated communications holdings, inc.                consolidated communications of               delaware                central  KNSLTTT KMNKXNS HLTNKS INK            KNSLTTT KMNKXNS OF\n",
-       "168           0.959568     consolidated communications holdings, inc.                consolidated communications of               delaware                florida  KNSLTTT KMNKXNS HLTNKS INK            KNSLTTT KMNKXNS OF\n",
-       "169           0.959568     consolidated communications holdings, inc.                consolidated communications of               delaware             california  KNSLTTT KMNKXNS HLTNKS INK            KNSLTTT KMNKXNS OF\n",
-       "174           0.573277                               duke energy corp                          duke energy one, inc               delaware               delaware                 TK ENRJ KRP                TK ENRJ ON INK\n",
-       "177           0.573277                      verus international, inc.                      emcor international, inc               delaware               delaware            FRS INTRNXNL INK             EMKR INTRNXNL INK\n",
-       "178           0.573277                      verus international, inc.                      emcor international, inc               delaware               delaware            FRS INTRNXNL INK             EMKR INTRNXNL INK\n",
-       "179           0.714594                              green plains inc.        green plains superior llc fka superior                   iowa                   iowa                KRN PLNS INK      KRN PLNS SPRR LK FK SPRR\n",
-       "183           0.996128                               duke energy corp                        duke energy group, llc               delaware               delaware                 TK ENRJ KRP                TK ENRJ KRP LK\n",
-       "195           0.884993                     green stream holdings inc.                    western gas wyoming, l.l.c                wyoming                wyoming         KRN STRM HLTNKS INK             WSTRN KS YMNK LLK\n",
-       "196           0.884993                     green stream holdings inc.                    western gas wyoming, l.l.c                wyoming                wyoming         KRN STRM HLTNKS INK             WSTRN KS YMNK LLK\n",
-       "197           0.992184                         fortress biotech, inc.                  fortress biotech, china, inc               delaware                   None               FRTRS BTX INK              FRTRS BTX XN INK\n",
-       "199           0.996128                               duke energy corp                        duke energy china corp               delaware               delaware                 TK ENRJ KRP                TK ENRJ XN KRP\n",
-       "200           0.573277                               duke energy corp           duke energy corporate services, inc               delaware               delaware                 TK ENRJ KRP       TK ENRJ KRPRT SRFSS INK\n",
-       "203           0.573277                 apollo global management, inc.                 apollo belenos management llc               delaware               delaware         APL KLBL MNJMNT INK            APL BLNS MNJMNT LK\n",
-       "204           0.573277                 apollo global management, inc.                 apollo belenos management llc               delaware               delaware         APL KLBL MNJMNT INK            APL BLNS MNJMNT LK\n",
-       "206           0.981099                  columbia property trust, inc.                       columbia courtyard, inc               maryland               maryland         KLMB PRPRT TRST INK               KLMB KRTYRT INK\n",
-       "208           0.573277                               duke energy corp                     duke energy beckjord, llc               delaware               delaware                 TK ENRJ KRP              TK ENRJ BKJRT LK\n",
-       "209           0.573277                               duke energy corp              duke energy beckjord storage llc               delaware               delaware                 TK ENRJ KRP         TK ENRJ BKJRT STRJ LK\n",
-       "210           0.573277                               duke energy corp                          duke energy acp, llc               delaware               delaware                 TK ENRJ KRP                TK ENRJ AKP LK\n",
-       "213           0.981099                    spirit realty capital, inc.                              spirit reit, inc               maryland               maryland           SPRT RLT KPTL INK                   SPRT RT INK\n",
-       "215           0.573277                 apollo global management, inc.                  apollo na management ii, llc               delaware               delaware         APL KLBL MNJMNT INK               APL N MNJMNT LK\n",
-       "216           0.573277                 apollo global management, inc.                  apollo na management ii, llc               delaware               delaware         APL KLBL MNJMNT INK               APL N MNJMNT LK\n",
-       "225           0.992184                         fortress biotech, inc.                  fortress biotech, china, inc               delaware                   None               FRTRS BTX INK              FRTRS BTX XN INK\n",
-       "226           0.573277                     green brick partners, inc.                     green brick mortgage, llc               delaware               delaware          KRN BRK PRTNRS INK              KRN BRK MRTKJ LK\n",
-       "227           0.573277                               duke energy corp              duke energy beckjord storage llc               delaware               delaware                 TK ENRJ KRP         TK ENRJ BKJRT STRJ LK\n",
-       "228           0.959568                              green plains inc.                      green plains madison llc                   iowa               delaware                KRN PLNS INK              KRN PLNS MTSN LK\n",
-       "242           0.959568                 great lakes dredge & dock corp      great lakes dredge & dock do brasil ltda               delaware                 brazil          KRT LKS TRJ TK KRP     KRT LKS TRJ TK T BRSL LTT\n",
-       "243           0.573277                 great lakes dredge & dock corp  great lakes dredge & dock environmental, inc               delaware               delaware          KRT LKS TRJ TK KRP  KRT LKS TRJ TK ENFRNMNTL INK\n",
-       "244           0.996128                 great lakes dredge & dock corp        great lakes dredge & dock company, llc               delaware               delaware          KRT LKS TRJ TK KRP        KRT LKS TRJ TK KMPN LK\n",
-       "251           0.573277                           blackstone group inc                        blackstone pb ii l.l.c               delaware               delaware              BLKSTN KRP INK                 BLKSTN PB LLK\n",
-       "252           0.573277                           blackstone group inc                         blackstone pb i l.l.c               delaware               delaware              BLKSTN KRP INK               BLKSTN PB I LLK\n",
-       "254           0.573277                               duke energy corp                          duke energy acp, llc               delaware               delaware                 TK ENRJ KRP                TK ENRJ AKP LK\n",
-       "255           0.573277                               duke energy corp                     duke energy shoreham, llc               delaware               delaware                 TK ENRJ KRP               TK ENRJ XRHM LK\n",
-       "256           0.573277                               duke energy corp                          duke energy sam, llc               delaware               delaware                 TK ENRJ KRP                 TK ENRJ SM LK\n",
-       "257           0.573277                           blackstone group inc                          blackstone obs l.l.c               delaware               delaware              BLKSTN KRP INK                BLKSTN OBS LLK\n",
-       "264           0.992184                       freightcar america, inc.               freightcar america leasing, llc               delaware                   None              FRTKR AMRK INK            FRTKR AMRK LSNK LK\n",
-       "265           0.992184                       freightcar america, inc.               freightcar america leasing, llc               delaware                   None              FRTKR AMRK INK            FRTKR AMRK LSNK LK\n",
-       "266           0.959568                            qurate retail, inc.                      qurate retail group, inc              englewood                     de                 KRT RTL INK               KRT RTL KRP INK\n",
-       "267           0.884993                     green stream holdings inc.                    western gas wyoming, l.l.c                wyoming                wyoming         KRN STRM HLTNKS INK             WSTRN KS YMNK LLK\n",
-       "268           0.884993                     green stream holdings inc.                    western gas wyoming, l.l.c                wyoming                wyoming         KRN STRM HLTNKS INK             WSTRN KS YMNK LLK"
+       "     match_probability                              company_name_l                                     company_name_r     loc_list_l              loc_list_r  company_name_mphone_l       company_name_mphone_r\n",
+       "465           0.914612                          conns incorporated                               invenco incorporated     [delaware]              [delaware]                    KNS                       INFNK\n",
+       "466           0.914612         vishay intertechnology incorporated                vishay precision foil, incorporated     [delaware]              [delaware]           FX INTRTXNLJ                 FX PRSXN FL\n",
+       "467           0.980607        vishay precision group, incorporated                vishay precision foil, incorporated     [delaware]              [delaware]           FX PRSXN KRP                 FX PRSXN FL\n",
+       "470           0.975104             jones lang lasalle incorporated                         jones lang lasalle limited     [maryland]            [hong, kong]            JNS LNK LSL                 JNS LNK LSL\n",
+       "471           0.951657                    nrg energy, incorporated                           nrg energy, incorporated     [delaware]              [delaware]               NRK ENRJ                    NRK ENRJ\n",
+       "472           0.914612                     firstenergy corporation                   firstenergy ventures corporation         [ohio]                  [ohio]                FRSTNRJ               FRSTNRJ FNTRS\n",
+       "478           0.914612     hudson pacific properties, incorporated              hudson pacific services, incorporated     [maryland]              [maryland]       HTSN PSFK PRPRTS             HTSN PSFK SRFSS\n",
+       "479           0.980607     hudson pacific properties, incorporated     hudson pacific properties, limited partnership     [maryland]              [maryland]       HTSN PSFK PRPRTS            HTSN PSFK PRPRTS\n",
+       "481           0.914612                  digital ally, incorporated           digital ally international, incorporated       [nevada]                [nevada]                TJTL AL            TJTL AL INTRNXNL\n",
+       "489           0.976947      cco holdings limited liability company           rhfw holdings, limited liability company            NaN              [delaware]              KK HLTNKS                  RHF HLTNKS\n",
+       "493           0.975104             intuitive surgical incorporated                         intuitive surgical limited     [delaware]       [united, kingdom]            INTTF SRJKL                 INTTF SRJKL\n",
+       "494           0.975104             jones lang lasalle incorporated                         jones lang lasalle limited     [maryland]               [england]            JNS LNK LSL                 JNS LNK LSL\n",
+       "500           0.975104                becton dickinson and company             becton, dickinson and company, limited  [new, jersey]               [ireland]         BKTN TKNSN ANT              BKTN TKNSN ANT\n",
+       "501           0.975104          united parcel service incorporated             united guaranty services, incorporated     [delaware]       [north, carolina]         UNTT PRSL SRFS             UNTT KRNT SRFSS\n",
+       "509           0.914612         estee lauder companies incorporated           estee lauder international, incorporated     [delaware]              [delaware]          EST LTR KMPNS            EST LTR INTRNXNL\n",
+       "510           0.914612                       maxcyte, incorporated                                 cues, incorporated     [delaware]              [delaware]                  MKSST                          KS\n",
+       "515           0.980607        zimmer biomet holdings, incorporated                  zimmer biomet spine, incorporated     [delaware]              [delaware]         SMR BMT HLTNKS                 SMR BMT SPN\n",
+       "518           0.914612               nordicus partners corporation                   nordco enterprises, incorporated     [delaware]  [wilmington, delaware]           NRTKS PRTNRS               NRTK ENTRPRSS\n",
+       "519           0.975104                       valero energy corp/tx                         valero energy incorporated     [delaware]                [canada]           FLR ENRJ TKS                    FLR ENRJ\n",
+       "527           0.914612                    nrg energy, incorporated                   nrg energy holdings incorporated     [delaware]              [delaware]               NRK ENRJ             NRK ENRJ HLTNKS\n",
+       "528           0.914612                 everi holdings incorporated                         edi holdings, incorporated     [delaware]              [delaware]             EFR HLTNKS                   ET HLTNKS\n",
+       "535           0.914612         estee lauder companies incorporated                          estee lauder incorporated     [delaware]              [delaware]          EST LTR KMPNS                     EST LTR\n",
+       "548           0.975104  universal logistics holdings, incorporated                    universal logistics corporation     [michigan]               [florida]   UNFRSL LJSTKS HLTNKS               UNFRSL LJSTKS\n",
+       "551           0.975104                  alliant energy corporation                            allergan gi corporation    [wisconsin]              [delaware]              ALNT ENRJ                     ALRKN J\n",
+       "555           0.975104                   smartmetric, incorporated                            smartpetro incorporated       [nevada]           [philippines]               SMRTMTRK                     SMRTPTR\n",
+       "566           0.914612             republic services, incorporated                     republic conduit, incorporated     [delaware]              [delaware]            RPBLK SRFSS                  RPBLK KNTT\n",
+       "571           0.975104              freedom holdings, incorporated                      freedom designs, incorporated     [maryland]            [california]            FRTM HLTNKS                  FRTM TSKNS\n",
+       "573           0.938457  ares real estate income trust incorporated         ares real estate income trust incorporated     [maryland]              [delaware]  ARS RL ESTT INKM TRST       ARS RL ESTT INKM TRST\n",
+       "574           0.975104         bank of new york mellon corporation                      bank of new york mellon sa/nv     [delaware]               [belgium]       BNK OF N YRK MLN        BNK OF N YRK MLN SNF\n",
+       "576           0.914612                            southern company                     southern wood piedmont company     [delaware]              [delaware]                   S0RN               S0RN WT PTMNT\n",
+       "582           0.914612                      ameresco, incorporated                            ameripath, incorporated     [delaware]              [delaware]                  AMRSK                       AMRP0\n",
+       "584           0.914612                        trevena incorporated                                  anr, incorporated     [delaware]              [delaware]                   TRFN                         ANR\n",
+       "590           0.975104         bank of new york mellon corporation                            bank of new york mellon     [delaware]             [new, york]       BNK OF N YRK MLN            BNK OF N YRK MLN\n",
+       "591           0.938457                  xerox holdings corporation                         xerox holdings corporation  [connecticut]             [new, york]            SRKS HLTNKS                 SRKS HLTNKS\n",
+       "594           0.975104             jones lang lasalle incorporated                jones lang lasalle ip, incorporated     [maryland]              [delaware]            JNS LNK LSL              JNS LNK LSL IP\n",
+       "595           0.914612                  iron mountain incorporated        iron mountain global holdings, incorporated     [delaware]              [delaware]               IRN MNTN        IRN MNTN KLBL HLTNKS\n",
+       "597           0.980607               extreme networks incorporated                 extreme networks ihc, incorporated     [delaware]              [delaware]          EKSTRM NTWRKS            EKSTRM NTWRKS IK\n",
+       "599           0.976947                   q2 holdings, incorporated                          vr holdings, incorporated            NaN              [colorado]               K HLTNKS                   FR HLTNKS\n",
+       "600           0.980607               extreme networks incorporated                     extreme networks, incorporated     [delaware]              [delaware]          EKSTRM NTWRKS               EKSTRM NTWRKS\n",
+       "604           0.914612                         cutera incorporated                                 vrec, incorporated     [delaware]              [delaware]                    KTR                         FRK\n",
+       "605           0.975104                    assured guaranty limited                  assured guaranty services limited           [d0]               [england]              ASRT KRNT             ASRT KRNT SRFSS\n",
+       "606           0.976947                        virtra, incorporated                               viator, incorporated       [nevada]                     NaN                   FRTR                         FTR\n",
+       "618           0.975104   sculptor capital management, incorporated      sculptor capital management hong kong limited     [delaware]            [hong, kong]     SKLPTR KPTL MNJMNT  SKLPTR KPTL MNJMNT HNK KNK\n",
+       "625           0.975104                        enstar group limited                                     enstar limited           [d0]               [bermuda]              ENSTR KRP                       ENSTR\n",
+       "626           0.975104    sellas life sciences group, incorporated                 sellas life sciences group limited     [delaware]               [bermuda]       SLS LF SSNSS KRP            SLS LF SSNSS KRP\n",
+       "627           0.975104             intuitive surgical incorporated             intuitive surgical canada incorporated     [delaware]                [canada]            INTTF SRJKL             INTTF SRJKL KNT\n",
+       "630           0.951657                 forestar group incorporated                        forestar group incorporated     [delaware]              [delaware]              FRSTR KRP                   FRSTR KRP\n",
+       "637           0.914612          dcp midstream, limited partnership       dcp midstream operating, limited partnership     [delaware]              [delaware]             TKP MTSTRM           TKP MTSTRM OPRTNK\n",
+       "639           0.951657            equitable holdings, incorporated                   equitable holdings, incorporated     [delaware]              [delaware]           EKTBL HLTNKS                EKTBL HLTNKS\n",
+       "643           0.914612         energy transfer limited partnership  energy transfer partners, limited liability co...     [delaware]              [delaware]            ENRJ TRNSFR          ENRJ TRNSFR PRTNRS"
       ]
      },
-     "execution_count": 249,
+     "execution_count": 79,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "preds_df[preds_df.match_probability >= .5][[\"match_probability\", \"company_name_l\", \"company_name_r\", \"loc_of_incorporation_l\", \"loc_of_incorporation_r\", \"company_name_mphone_l\", \"company_name_mphone_r\"]].iloc[150:200]"
+    "preds_df[preds_df.match_probability >= .9][[\"match_probability\", \"company_name_l\", \"company_name_r\", \"loc_list_l\", \"loc_list_r\", \"company_name_mphone_l\", \"company_name_mphone_r\"]].iloc[150:200]"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fb2122d8-ff0a-4117-a91c-17a0523dcfcb",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
diff --git a/notebooks/18-kl-splink-sec-eia.ipynb b/notebooks/18-kl-splink-sec-eia.ipynb
index 111ae84..0d74c13 100644
--- a/notebooks/18-kl-splink-sec-eia.ipynb
+++ b/notebooks/18-kl-splink-sec-eia.ipynb
@@ -13,7 +13,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 262,
+   "execution_count": 2,
    "id": "1107fe42-197c-4fea-9c48-06d08699af0b",
    "metadata": {},
    "outputs": [],
@@ -23,13 +23,16 @@
     "from pathlib import Path\n",
     "\n",
     "import pandas as pd\n",
+    "from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, confusion_matrix\n",
     "from splink import block_on, DuckDBAPI, Linker, SettingsCreator\n",
     "from splink.blocking_analysis import count_comparisons_from_blocking_rule, cumulative_comparisons_to_be_scored_from_blocking_rules_chart, n_largest_blocks\n",
     "import splink.comparison_library as cl\n",
+    "import splink.comparison_level_library as cll\n",
     "from splink.exploratory import completeness_chart, profile_columns\n",
     "from upath import UPath\n",
     "\n",
-    "from mozilla_sec_eia.models.sec_eia_record_linkage.preprocessing import prepare_sec10k_basic_info_df, prepare_ex21_df"
+    "from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive, convert_ex21_id_to_filename\n",
+    "from mozilla_sec_eia.models.sec_eia_record_linkage.preprocessing import add_sec_company_id_to_subsidiaries, prepare_sec10k_basic_info_df, prepare_eia_df, prepare_ex21_df"
    ]
   },
   {
@@ -53,7 +56,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 3,
    "id": "4ab5594d-7d1f-425d-80e1-92c30be73011",
    "metadata": {
     "tags": []
@@ -65,7 +68,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 4,
    "id": "2edc29d4-6c85-4b31-aae6-0de38c846e44",
    "metadata": {
     "tags": []
@@ -77,7 +80,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 5,
    "id": "eaa37762-9f94-4927-9341-0ab09be3c8ab",
    "metadata": {
     "tags": []
@@ -89,7 +92,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 104,
+   "execution_count": 6,
    "id": "3fb7895f-10c5-4450-96f9-77b36471b53e",
    "metadata": {
     "tags": []
@@ -101,7 +104,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 7,
    "id": "06c76b82-1aad-47b2-aecc-6225a286cc40",
    "metadata": {
     "tags": []
@@ -118,7 +121,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 8,
    "id": "d95acde9-1640-4c26-a5d1-c50b6666ccf4",
    "metadata": {
     "tags": []
@@ -130,7 +133,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 9,
    "id": "3b7484de-bbc7-47ba-b408-a1af1183018c",
    "metadata": {
     "tags": []
@@ -149,7 +152,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 10,
    "id": "d3d39fc0-130f-4bbd-9cc9-bbaf58808109",
    "metadata": {
     "tags": []
@@ -162,7 +165,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 105,
+   "execution_count": 11,
    "id": "04b6b682-91f4-49e2-9f74-2861548d1dd4",
    "metadata": {},
    "outputs": [],
@@ -185,7 +188,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 12,
    "id": "d4e950a6-ee6c-414c-b5b9-52a4175bf0b7",
    "metadata": {},
    "outputs": [],
@@ -195,7 +198,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 13,
    "id": "14eb7f24-7f7b-43aa-a0df-85e888e43821",
    "metadata": {},
    "outputs": [],
@@ -208,140 +211,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
-   "id": "23da5ca1-bd04-44d4-b252-7b114d6d553f",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th>value</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>filename</th>\n",
-       "      <th>filer_count</th>\n",
-       "      <th>block</th>\n",
-       "      <th>block_count</th>\n",
-       "      <th>key</th>\n",
-       "      <th></th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th rowspan=\"5\" valign=\"top\">edgar/data/100240/0000950144-94-000787.txt</th>\n",
-       "      <th rowspan=\"5\" valign=\"top\">0</th>\n",
-       "      <th rowspan=\"5\" valign=\"top\">company_data</th>\n",
-       "      <th rowspan=\"5\" valign=\"top\">0</th>\n",
-       "      <th>company_conformed_name</th>\n",
-       "      <td>turner broadcasting system inc</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>central_index_key</th>\n",
-       "      <td>0000100240</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>standard_industrial_classification</th>\n",
-       "      <td>4833</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>irs_number</th>\n",
-       "      <td>580950695</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>state_of_incorporation</th>\n",
-       "      <td>ga</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <th>...</th>\n",
-       "      <th>...</th>\n",
-       "      <th>...</th>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th rowspan=\"5\" valign=\"top\">edgar/data/936528/0000936528-23-000207.txt</th>\n",
-       "      <th rowspan=\"5\" valign=\"top\">0</th>\n",
-       "      <th rowspan=\"5\" valign=\"top\">former_company</th>\n",
-       "      <th>0</th>\n",
-       "      <th>date_of_name_change</th>\n",
-       "      <td>20230928</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th rowspan=\"2\" valign=\"top\">1</th>\n",
-       "      <th>former_conformed_name</th>\n",
-       "      <td>wafd inc</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>date_of_name_change</th>\n",
-       "      <td>20230927</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th rowspan=\"2\" valign=\"top\">2</th>\n",
-       "      <th>former_conformed_name</th>\n",
-       "      <td>washington federal inc</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>date_of_name_change</th>\n",
-       "      <td>19950206</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>7980908 rows × 1 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                                                                                                                               value\n",
-       "filename                                   filer_count block          block_count key                                                               \n",
-       "edgar/data/100240/0000950144-94-000787.txt 0           company_data   0           company_conformed_name              turner broadcasting system inc\n",
-       "                                                                                  central_index_key                                       0000100240\n",
-       "                                                                                  standard_industrial_classification                            4833\n",
-       "                                                                                  irs_number                                               580950695\n",
-       "                                                                                  state_of_incorporation                                          ga\n",
-       "...                                                                                                                                              ...\n",
-       "edgar/data/936528/0000936528-23-000207.txt 0           former_company 0           date_of_name_change                                       20230928\n",
-       "                                                                      1           former_conformed_name                                     wafd inc\n",
-       "                                                                                  date_of_name_change                                       20230927\n",
-       "                                                                      2           former_conformed_name                       washington federal inc\n",
-       "                                                                                  date_of_name_change                                       19950206\n",
-       "\n",
-       "[7980908 rows x 1 columns]"
-      ]
-     },
-     "execution_count": 29,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "raw_sec_df"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 87,
+   "execution_count": null,
    "id": "1be3364e-9887-42b2-b303-0a24e8681acf",
    "metadata": {
     "tags": []
@@ -352,6 +222,16 @@
     "raw_sec_df.columns.name = None"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "f6f76c8b-ffbf-4e2b-870b-57f1260ba522",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sec_clean_df = prepare_sec10k_basic_info_df(raw_sec_df)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "3bac9280-1183-4aba-b78f-84bcf37ef1e2",
@@ -362,7 +242,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 58,
+   "execution_count": 17,
    "id": "611da616-45ef-40ae-bc06-8bfbc871274d",
    "metadata": {},
    "outputs": [],
@@ -372,7 +252,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 91,
+   "execution_count": 18,
    "id": "1d6272f2-b6f3-4497-9251-cbeedf794a0b",
    "metadata": {},
    "outputs": [],
@@ -390,25 +270,16 @@
   {
    "cell_type": "markdown",
    "id": "b636d438-ed71-426c-8c2a-9e550fe99958",
-   "metadata": {},
-   "source": [
-    "# Preprocessing"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 153,
-   "id": "f6f76c8b-ffbf-4e2b-870b-57f1260ba522",
-   "metadata": {},
-   "outputs": [],
+   "metadata": {
+    "tags": []
+   },
    "source": [
-    "# cleaning on both sides\n",
-    "sec_clean_df = prepare_sec10k_basic_info_df(raw_sec_df)"
+    "# Preprocess Ex. 21"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 157,
+   "execution_count": 19,
    "id": "84e26751-663b-45a5-bb4d-fbfbbdca447e",
    "metadata": {},
    "outputs": [
@@ -416,7 +287,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/Users/katielamb/CatalystCoop/mozilla-sec-eia/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py:189: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
+      "/Users/katielamb/CatalystCoop/mozilla-sec-eia/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py:168: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
       "  df = df.fillna(np.nan)\n"
      ]
     }
@@ -427,493 +298,638 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 224,
-   "id": "24defbd5-ccfe-4844-ab87-3adb1b4df2d9",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "eia_clean_df = prepare_eia_df(eia_df)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 228,
-   "id": "a284b2c9-8edf-4b3f-ab08-5b2cff65ed19",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "SHARED_COLS = [\n",
-    "    \"record_id\",\n",
-    "    \"report_date\",\n",
-    "    \"report_year\",\n",
-    "    \"company_name\",\n",
-    "    \"street_address\",\n",
-    "    \"street_address_2\",\n",
-    "    \"city\",\n",
-    "    \"state\",  # could use state of incorporation from SEC\n",
-    "    \"zip_code\",\n",
-    "    \"phone_number\",\n",
-    "    \"company_name_mphone\"\n",
-    "]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e90de0d3-3220-4869-80a3-fc7dd381d393",
+   "execution_count": 34,
+   "id": "027191c4-82fa-491b-8c73-54551c7fa4e6",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# strip legal terms and then make a list column from company name\n",
-    "# use this for blocking and comnparison levels\n",
-    "eia_match_df[\"company_name_mphone_list\"] = eia_match_df[\"company_name_mphone\"].str.split()"
+    "sec_match_df = sec_clean_df.drop_duplicates(subset=[\"central_index_key\", \"company_name\", \"loc_of_incorporation\", \"report_year\"])\n",
+    "merged_df = sec_match_df.merge(ex21_clean_df, how=\"inner\", on=\"company_name\", suffixes=(\"_sec\", \"_ex21\"))\n",
+    "merged_df.loc[:, \"loc_tokens_sec\"] = merged_df[\"loc_of_incorporation_sec\"].fillna(\"\").str.lower().str.split()\n",
+    "merged_df.loc[:, \"loc_tokens_ex21\"] = merged_df[\"loc_of_incorporation_ex21\"].fillna(\"\").str.lower().str.split()\n",
+    "merged_df[\"loc_overlap\"] = merged_df.apply(\n",
+    "    lambda row: len(set(row[\"loc_tokens_sec\"]) & set(row[\"loc_tokens_ex21\"])), axis=1\n",
+    ")\n",
+    "merged_df[\"report_year_diff\"] = merged_df.apply(\n",
+    "    lambda row: abs(int(row[\"report_year_sec\"]) - int(row[\"report_year_ex21\"])), axis=1\n",
+    ")\n",
+    "# Sort by CIK, company_name, loc_overlap, and report_year_diff\n",
+    "# so that we can then choose the first record in each CIK, company_name group\n",
+    "merged_df = merged_df.sort_values(by=[\"central_index_key\", \"company_name\", \"loc_overlap\", \"report_year_diff\"],\n",
+    "                                  ascending=[True, True, False, True]\n",
+    "                                 )\n",
+    "# Select the row with the highest loc overlap and nearest report years for each CIK and company name\n",
+    "cik_and_company_pairs = merged_df.groupby([\"central_index_key\", \"company_name\"], as_index=False).first()\n",
+    "# We now have the closest matching CIK and company name pairs\n",
+    "# We want to get the best matching CIK for each company name and loc of incorporation\n",
+    "# Select the row with the highest loc overlap and nearest report years for each company name and loc pair\n",
+    "cik_and_company_pairs = cik_and_company_pairs.sort_values(by=[\"company_name\", \"loc_of_incorporation_ex21\", \"loc_overlap\", \"report_year_diff\"],\n",
+    "                                                          ascending=[True, True, False, True]\n",
+    "                                                         )\n",
+    "closest_match = cik_and_company_pairs.groupby([\"company_name\", \"loc_of_incorporation_ex21\"], as_index=False).first()\n",
+    "closest_match = closest_match.drop_duplicates(subset=[\"central_index_key\", \"company_name\", \"loc_of_incorporation_ex21\"])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "460c5bd5-f2e2-45c3-86c3-ac203bd053d0",
+   "execution_count": 35,
+   "id": "bd9e9f44-7ff8-4615-a5c3-ee8f32439e26",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "False    5808\n",
+       "Name: count, dtype: int64"
+      ]
+     },
+     "execution_count": 35,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "# create list column for address information as well"
+    "# a company name and location of incorporation should match to only one CIK\n",
+    "closest_match.duplicated(subset=[\"company_name\", \"loc_of_incorporation_ex21\"]).value_counts()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 158,
-   "id": "c3bdc160-1939-4f34-914f-ecb0b5fdb5ac",
+   "execution_count": 36,
+   "id": "64572f77-0a64-48a9-83fd-1c0179202010",
    "metadata": {},
    "outputs": [
     {
      "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>report_date</th>\n",
-       "      <th>report_year</th>\n",
-       "      <th>company_name</th>\n",
-       "      <th>street_address</th>\n",
-       "      <th>street_address_2</th>\n",
-       "      <th>city</th>\n",
-       "      <th>state</th>\n",
-       "      <th>zip_code</th>\n",
-       "      <th>phone_number</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>2000-03-30</td>\n",
-       "      <td>2000</td>\n",
-       "      <td>meta group incorporated</td>\n",
-       "      <td>208 harbor dr</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>stamford</td>\n",
-       "      <td>ct</td>\n",
-       "      <td>06912-0061</td>\n",
-       "      <td>2039736700</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>2001-04-02</td>\n",
-       "      <td>2001</td>\n",
-       "      <td>meta group incorporated</td>\n",
-       "      <td>208 harbor dr</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>stamford</td>\n",
-       "      <td>ct</td>\n",
-       "      <td>06912-0061</td>\n",
-       "      <td>2039736700</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>2002-04-01</td>\n",
-       "      <td>2002</td>\n",
-       "      <td>meta group incorporated</td>\n",
-       "      <td>208 harbor dr</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>stamford</td>\n",
-       "      <td>ct</td>\n",
-       "      <td>06912-0061</td>\n",
-       "      <td>2039736700</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
       "text/plain": [
-       "  report_date  report_year             company_name street_address street_address_2      city state    zip_code phone_number\n",
-       "0  2000-03-30         2000  meta group incorporated  208 harbor dr              NaN  stamford    ct  06912-0061   2039736700\n",
-       "1  2001-04-02         2001  meta group incorporated  208 harbor dr              NaN  stamford    ct  06912-0061   2039736700\n",
-       "2  2002-04-01         2002  meta group incorporated  208 harbor dr              NaN  stamford    ct  06912-0061   2039736700"
+       "central_index_key\n",
+       "False    5532\n",
+       "True      276\n",
+       "Name: count, dtype: int64"
       ]
      },
-     "execution_count": 158,
+     "execution_count": 36,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "sec_clean_df[SHARED_COLS].head(3)"
+    "# it's okay if there's duplication here\n",
+    "# multiple subsidiaries can point to the same CIK\n",
+    "# and company names can change and they still keep the same CIK\n",
+    "closest_match.central_index_key.duplicated().value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "a669e0b7-c7fb-4c12-9121-0282e616286a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ex21_with_cik = ex21_clean_df.merge(\n",
+    "    closest_match[[\"company_name\", \"central_index_key\", \"loc_of_incorporation_ex21\"]].rename(columns={\"loc_of_incorporation_ex21\": \"loc_of_incorporation\"}),\n",
+    "    how=\"left\",\n",
+    "    on=[\"company_name\", \"loc_of_incorporation\"],\n",
+    ").rename(columns={\"central_index_key\": \"subsidiary_cik\"})"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 159,
-   "id": "9d73fdac-8d97-4030-9772-79ac058b0d33",
+   "execution_count": 38,
+   "id": "245697ec-9451-47e7-953b-eba65062ee93",
    "metadata": {},
    "outputs": [
     {
      "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>report_date</th>\n",
-       "      <th>report_year</th>\n",
-       "      <th>company_name</th>\n",
-       "      <th>street_address</th>\n",
-       "      <th>street_address_2</th>\n",
-       "      <th>city</th>\n",
-       "      <th>state</th>\n",
-       "      <th>zip_code</th>\n",
-       "      <th>phone_number</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>33</th>\n",
-       "      <td>2023-01-01</td>\n",
-       "      <td>2023</td>\n",
-       "      <td>desert willow energy storage</td>\n",
-       "      <td>100 bayview circle</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>newport beach</td>\n",
-       "      <td>ca</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>35</th>\n",
-       "      <td>2023-01-01</td>\n",
-       "      <td>2023</td>\n",
-       "      <td>portage solar plant</td>\n",
-       "      <td>n8917</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>portage</td>\n",
-       "      <td>wi</td>\n",
-       "      <td>53901</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>37</th>\n",
-       "      <td>2023-01-01</td>\n",
-       "      <td>2023</td>\n",
-       "      <td>nsf energy one limited liability company</td>\n",
-       "      <td>1241 university ave</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>rochester</td>\n",
-       "      <td>ny</td>\n",
-       "      <td>14607</td>\n",
-       "      <td>NaN</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
       "text/plain": [
-       "   report_date  report_year                              company_name       street_address street_address_2           city state zip_code phone_number\n",
-       "33  2023-01-01         2023              desert willow energy storage   100 bayview circle              NaN  newport beach    ca      NaN          NaN\n",
-       "35  2023-01-01         2023                       portage solar plant                n8917              NaN        portage    wi    53901          NaN\n",
-       "37  2023-01-01         2023  nsf energy one limited liability company  1241 university ave              NaN      rochester    ny    14607          NaN"
+       "subsidiary_cik\n",
+       "True     2900030\n",
+       "False      21674\n",
+       "Name: count, dtype: int64"
       ]
      },
-     "execution_count": 159,
+     "execution_count": 38,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "eia_clean_df[~eia_match_df.street_address.isnull()][SHARED_COLS].head(3)"
+    "ex21_with_cik.subsidiary_cik.isnull().value_counts()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 160,
-   "id": "db2b1e13-824e-4c86-8065-fc99e9a1186c",
+   "execution_count": 39,
+   "id": "1382a2e4-e88e-47bb-93ed-dafc576ec2f4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ex21_with_cik = ex21_with_cik.merge(closest_match[[\"company_name\", \"central_index_key\"]],\n",
+    "                                    how=\"left\",\n",
+    "                                    on=\"company_name\"\n",
+    "                                   ).rename(columns={\"central_index_key\": \"company_name_merge_cik\"})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "id": "5f70e3ff-2494-4eda-bfa2-6989bcf442bb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# if a subsidiary doesn't have a CIK and has a null location\n",
+    "# but its company name was assigned a CIK (with a different location)\n",
+    "# then assign that CIK to the subsidiary\n",
+    "ex21_with_cik[\"subsidiary_cik\"] = ex21_with_cik[\"subsidiary_cik\"].where(\n",
+    "    ~(ex21_with_cik.subsidiary_cik.isnull()) | ~(ex21_with_cik.loc_of_incorporation.isnull()), \n",
+    "    ex21_with_cik[\"company_name_merge_cik\"]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "id": "63d4cc13-a4bf-4473-99bb-6d8fcf9a1174",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "subsidiary_cik\n",
+       "True     2897527\n",
+       "False      24221\n",
+       "Name: count, dtype: int64"
+      ]
+     },
+     "execution_count": 41,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
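+    "# there should be fewer null CIKs now (NOTE: total rows also grew from 2,921,704 to 2,921,748, so the company_name merge likely duplicated some rows -- TODO confirm)\n",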
+    "ex21_with_cik.subsidiary_cik.isnull().value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "id": "e25cf09f-8bbd-4dcd-b308-71bc5a357bf5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "archive = GCSArchive()\n",
+    "md = archive.get_metadata()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "id": "d17ed466-74d6-44e5-aaca-8dc6793712d4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ex21_with_cik.loc[:, \"filename\"] = convert_ex21_id_to_filename(ex21_with_cik)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "id": "6303051b-74bf-4043-885e-aaaf6593852d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ex21_with_cik = ex21_with_cik.merge(md[\"cik\"],\n",
+    "                                    how=\"left\",\n",
+    "                                    left_on=\"filename\",\n",
+    "                                    right_index=True).rename(columns={\"cik\": \"parent_cik\"})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "id": "da72f2d4-54a8-487a-82ec-92d9e8df091f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ex21_with_cik = add_sec_company_id_to_subsidiaries(ex21_with_cik)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "id": "eff49691-d17c-4a55-817d-8eeaf83900e4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# drop the Ex. 21 subsidiaries that were matched to a filing company (keep rows with a null subsidiary_cik)\n",
+    "unmatched_ex21_df = ex21_with_cik[ex21_with_cik.subsidiary_cik.isnull()]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "381e4a8f-d8da-4802-b86e-1ac4fc3094db",
+   "metadata": {},
+   "source": [
+    "# Preprocess SEC and EIA\n",
+    "\n",
+    "Does it actually make sense to add in the Ex. 21 subsidiaries when we only have the company name?\n",
+    "Would it make more sense to do a direct match on company name after\n",
+    "the SEC basic info to EIA match is done, and then manually review any conflicting SEC matches (one from basic info and one from Ex. 21)?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 67,
+   "id": "8453d55d-a3ac-422d-9cef-e7f13d582efe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# find a way to use state of incorporation even though it's not on the EIA side?\n",
+    "sec_full_clean_df = pd.concat([sec_clean_df, \n",
+    "                               unmatched_ex21_df[[\"sec_company_id\", \"report_year\", \"company_name\", \"company_name_no_legal\", \"company_name_mphone\", \"state_of_incorporation\"]]\n",
+    "                              ])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 71,
+   "id": "2bc79d7d-b756-47d5-a61d-a3a761160250",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sec_full_clean_df = sec_full_clean_df.reset_index(drop=True).reset_index(names=\"record_id\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "166d3c96-93d6-4a22-afbf-8d94dc9ecfb9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# for now, just use sec_clean_df without Ex. 21 subsidiaries\n",
+    "sec_clean_df = sec_clean_df.reset_index(drop=True).reset_index(names=\"record_id\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "24defbd5-ccfe-4844-ab87-3adb1b4df2d9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "eia_clean_df = prepare_eia_df(eia_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "e754b2ef-5a0d-4582-8694-047528dfd339",
    "metadata": {},
    "outputs": [
     {
      "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>record_id</th>\n",
-       "      <th>id</th>\n",
-       "      <th>company_name_raw</th>\n",
-       "      <th>loc_of_incorporation</th>\n",
-       "      <th>own_per</th>\n",
-       "      <th>report_year</th>\n",
-       "      <th>company_name</th>\n",
-       "      <th>company_name_mphone</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>0</td>\n",
-       "      <td>14060-0000916131-94-000015</td>\n",
-       "      <td>brenton bank and trust company</td>\n",
-       "      <td>iowa</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>1994</td>\n",
-       "      <td>brenton bank and trust company</td>\n",
-       "      <td>BRNTN BNK ANT TRST KMPN</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>1</td>\n",
-       "      <td>14060-0000916131-94-000015</td>\n",
-       "      <td>adel</td>\n",
-       "      <td>iowa</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>1994</td>\n",
-       "      <td>adel</td>\n",
-       "      <td>ATL</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>2</td>\n",
-       "      <td>14060-0000916131-94-000015</td>\n",
-       "      <td>brenton savings bank, fsb united states</td>\n",
-       "      <td>ames, iowa</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>1994</td>\n",
-       "      <td>brenton savings bank, fsb united states</td>\n",
-       "      <td>BRNTN SFNKS BNK FSB UNTT STTS</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
       "text/plain": [
-       "   record_id                          id                         company_name_raw loc_of_incorporation own_per  report_year                             company_name            company_name_mphone\n",
-       "0          0  14060-0000916131-94-000015           brenton bank and trust company                 iowa     NaN         1994           brenton bank and trust company        BRNTN BNK ANT TRST KMPN\n",
-       "1          1  14060-0000916131-94-000015                                     adel                 iowa     NaN         1994                                     adel                            ATL\n",
-       "2          2  14060-0000916131-94-000015  brenton savings bank, fsb united states           ames, iowa     NaN         1994  brenton savings bank, fsb united states  BRNTN SFNKS BNK FSB UNTT STTS"
+       "True"
       ]
      },
-     "execution_count": 160,
+     "execution_count": 27,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "ex21_clean_df.head(3)"
+    "sec_clean_df.record_id.is_unique"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 229,
-   "id": "4ea7c80a-5b5b-4a07-bca0-b6ed1e78dce9",
+   "execution_count": 28,
+   "id": "38ad3504-2cde-455f-8896-6a435677541c",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "['record_id',\n",
-       " 'report_date',\n",
-       " 'report_year',\n",
-       " 'company_name',\n",
-       " 'street_address',\n",
-       " 'street_address_2',\n",
-       " 'city',\n",
-       " 'state',\n",
-       " 'zip_code',\n",
-       " 'phone_number',\n",
-       " 'company_name_mphone']"
+       "True"
       ]
      },
-     "execution_count": 229,
+     "execution_count": 28,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "SHARED_COLS"
+    "eia_clean_df.record_id.is_unique"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 231,
-   "id": "c734137b-fd76-4f6a-a828-23cd12d4ac27",
+   "execution_count": 30,
+   "id": "e90de0d3-3220-4869-80a3-fc7dd381d393",
    "metadata": {},
    "outputs": [],
    "source": [
-    "eia_match_df = eia_clean_df[SHARED_COLS]"
+    "# TODO: move this into preprocessing\n",
+    "# strip legal terms and then make a list column from company name\n",
+    "# use this for blocking and comparison levels\n",
+    "eia_clean_df.loc[:, \"company_name_mphone_list\"] = eia_clean_df[\"company_name_mphone\"].str.split()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 232,
-   "id": "2b8b6313-abf0-4233-8bad-43b8b9cc1e0b",
+   "execution_count": 31,
+   "id": "b71a24f2-51b5-444f-a645-054cc3e25cf8",
    "metadata": {},
    "outputs": [],
    "source": [
-    "sec_match_df = sec_clean_df[SHARED_COLS]"
+    "sec_clean_df.loc[:, \"company_name_mphone_list\"] = sec_clean_df[\"company_name_mphone\"].str.split()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "eb9c00dc-50a5-49cc-9589-0bf4df917ab3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "eia_clean_df.loc[:, \"zip_code\"] = eia_clean_df[\"zip_code\"].str[:5]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "edead864-7004-4081-ab78-313c14ff81a3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sec_clean_df.loc[:, \"zip_code\"] = sec_clean_df[\"zip_code\"].str[:5]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "a5af13b2-9d43-42e6-9477-1fb7d52412cf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# I think we don't need this column\n",
+    "eia_clean_df.loc[:, \"street_address_list\"] = eia_clean_df[\"street_address\"].str.split()\n",
+    "sec_clean_df.loc[:, \"street_address_list\"] = sec_clean_df[\"street_address\"].str.split()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "a284b2c9-8edf-4b3f-ab08-5b2cff65ed19",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "SHARED_COLS = [\n",
+    "    \"record_id\",\n",
+    "    \"report_date\",\n",
+    "    \"report_year\",\n",
+    "    \"company_name\",\n",
+    "    \"company_name_no_legal\",\n",
+    "    \"street_address\",\n",
+    "    \"street_address_list\",\n",
+    "    \"street_address_2\",\n",
+    "    \"city\",\n",
+    "    \"state\",  # could use state of incorporation from SEC\n",
+    "    \"zip_code\",\n",
+    "    \"phone_number\",\n",
+    "    \"company_name_mphone\",\n",
+    "    \"company_name_mphone_list\"\n",
+    "]"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "9a04c196-e926-4502-82ee-c27352352591",
-   "metadata": {
-    "jp-MarkdownHeadingCollapsed": true,
-    "tags": []
-   },
+   "id": "21b697b0-7d9e-452c-9b8b-ee40fd6bb7bd",
+   "metadata": {},
    "source": [
-    "# Link in Ex. 21 records"
+    "Create a list column for address information as well?"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 165,
-   "id": "c1500344-ff7f-450e-90dd-1105d8e7c637",
+   "execution_count": 55,
+   "id": "c734137b-fd76-4f6a-a828-23cd12d4ac27",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# run the Ex.21 to SEC model\n",
-    "filepath = Path(\"../sec_ex21_model_settings/2023_model.json\")\n",
-    "with open(filepath, 'r') as file:\n",
-    "    sec_ex21_settings = json.load(file)"
+    "eia_match_df = eia_clean_df[SHARED_COLS]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 192,
-   "id": "172ea84f-a0b7-4e9c-b746-322a47663171",
+   "execution_count": 38,
+   "id": "2b8b6313-abf0-4233-8bad-43b8b9cc1e0b",
    "metadata": {},
    "outputs": [],
    "source": [
-    "sec_test_df = sec_match_df[sec_match_df.report_year.isin([2016, 2017])][[\"record_id\", \"report_year\", \"company_name\", \"loc_of_incorporation\", \"company_name_mphone\"]]"
+    "sec_match_df = sec_clean_df[SHARED_COLS]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 193,
-   "id": "3f8ba4ee-b1e7-4e05-982e-43d8e446eea9",
+   "execution_count": 43,
+   "id": "a4a15b86-71cf-4d8d-9c09-f82a70f10273",
    "metadata": {},
    "outputs": [],
    "source": [
-    "ex21_test_df = ex21_match_df[ex21_match_df.report_year.isin([2016, 2017])][[\"record_id\", \"report_year\", \"company_name\", \"loc_of_incorporation\", \"company_name_mphone\"]]"
+    "match_cols = [\"company_name\", \"state\", \"city\", \"street_address\", \"zip_code\"]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 194,
-   "id": "2c715d7a-3d6d-4970-8ae3-5a6e1a12e937",
+   "execution_count": 49,
+   "id": "842fa02e-5202-445c-b728-72bce42e740d",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "14125"
+       "True     138441\n",
+       "False     39407\n",
+       "Name: count, dtype: int64"
       ]
      },
-     "execution_count": 194,
+     "execution_count": 49,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "len(sec_test_df)"
+    "# duplicates exist because of differing report years\n",
+    "eia_match_df.duplicated(subset=match_cols).value_counts()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 195,
-   "id": "ec13db12-3664-4e00-aa83-7c372039b230",
+   "execution_count": 52,
+   "id": "b53e6244-f0ca-4256-bc09-9c3264675389",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "233101"
+       "True     168445\n",
+       "False     64515\n",
+       "Name: count, dtype: int64"
       ]
      },
-     "execution_count": 195,
+     "execution_count": 52,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "len(ex21_test_df)"
+    "sec_match_df.duplicated(subset=match_cols).value_counts()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 196,
-   "id": "d2fcc1da-4435-4b17-8be7-cb34a6917522",
+   "execution_count": 56,
+   "id": "baa742ae-1b49-4d0a-84c8-5f864398c8ed",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sec_match_df = sec_match_df.sort_values(by=\"report_year\", ascending=False).drop_duplicates(subset=match_cols, keep=\"first\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "id": "63e47f5f-e142-48fa-9ffa-e14d27ee1476",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "eia_match_df = eia_match_df.sort_values(by=\"report_year\", ascending=False).drop_duplicates(subset=match_cols, keep=\"first\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "id": "5cf7ca17-b42b-40c6-b6f7-9077acdb1220",
    "metadata": {},
    "outputs": [
     {
      "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
+      "text/plain": [
+       "standard_industrial_classification\n",
+       "asset-backed securities [6189]          20311\n",
+       "pharmaceutical preparations [2834]       8530\n",
+       "state commercial banks [6022]            7886\n",
+       "real estate investment trusts [6798]     7706\n",
+       "services-prepackaged software [7372]     6007\n",
+       "Name: count, dtype: int64"
+      ]
+     },
+     "execution_count": 39,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# could try to use keywords like gas, electricity, utility etc.\n",
+    "sec_clean_df[\"standard_industrial_classification\"].value_counts().head(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 165,
+   "id": "c1500344-ff7f-450e-90dd-1105d8e7c637",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# run the Ex.21 to SEC model\n",
+    "filepath = Path(\"../sec_ex21_model_settings/2023_model.json\")\n",
+    "with open(filepath, 'r') as file:\n",
+    "    sec_ex21_settings = json.load(file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 192,
+   "id": "172ea84f-a0b7-4e9c-b746-322a47663171",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sec_test_df = sec_match_df[sec_match_df.report_year.isin([2016, 2017])][[\"record_id\", \"report_year\", \"company_name\", \"loc_of_incorporation\", \"company_name_mphone\"]]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 193,
+   "id": "3f8ba4ee-b1e7-4e05-982e-43d8e446eea9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ex21_test_df = ex21_match_df[ex21_match_df.report_year.isin([2016, 2017])][[\"record_id\", \"report_year\", \"company_name\", \"loc_of_incorporation\", \"company_name_mphone\"]]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 194,
+   "id": "2c715d7a-3d6d-4970-8ae3-5a6e1a12e937",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "14125"
+      ]
+     },
+     "execution_count": 194,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(sec_test_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 195,
+   "id": "ec13db12-3664-4e00-aa83-7c372039b230",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "233101"
+      ]
+     },
+     "execution_count": 195,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(ex21_test_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 196,
+   "id": "d2fcc1da-4435-4b17-8be7-cb34a6917522",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
        "    }\n",
        "\n",
        "    .dataframe thead th {\n",
@@ -1531,7 +1547,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 205,
+   "execution_count": 128,
    "id": "de894ad0-0b72-4486-b737-c3bfb95f8f05",
    "metadata": {},
    "outputs": [],
@@ -1541,7 +1557,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 233,
+   "execution_count": 129,
    "id": "ac4e560b-6946-4cc7-b2bc-6d5f4b154da6",
    "metadata": {},
    "outputs": [
@@ -1550,23 +1566,23 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-6c1f9f2c0b1d402d8bfc2877d4b72215.vega-embed {\n",
+       "  #altair-viz-568ae8a9a7b0476a9476900de3419267.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-6c1f9f2c0b1d402d8bfc2877d4b72215.vega-embed details,\n",
-       "  #altair-viz-6c1f9f2c0b1d402d8bfc2877d4b72215.vega-embed details summary {\n",
+       "  #altair-viz-568ae8a9a7b0476a9476900de3419267.vega-embed details,\n",
+       "  #altair-viz-568ae8a9a7b0476a9476900de3419267.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-6c1f9f2c0b1d402d8bfc2877d4b72215\"></div>\n",
+       "<div id=\"altair-viz-568ae8a9a7b0476a9476900de3419267\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-6c1f9f2c0b1d402d8bfc2877d4b72215\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-6c1f9f2c0b1d402d8bfc2877d4b72215\");\n",
+       "    if (outputDiv.id !== \"altair-viz-568ae8a9a7b0476a9476900de3419267\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-568ae8a9a7b0476a9476900de3419267\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -1612,25 +1628,26 @@
        "        .catch(showError)\n",
        "        .then(() => displayChart(vegaEmbed));\n",
        "    }\n",
-       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"layer\": [{\"mark\": \"rect\", \"encoding\": {\"color\": {\"field\": \"completeness\", \"legend\": null, \"scale\": {\"scheme\": \"darkred\", \"zero\": true}, \"type\": \"quantitative\"}, \"tooltip\": [{\"field\": \"source_dataset\", \"title\": \"Source dataset\", \"type\": \"nominal\"}, {\"field\": \"total_rows_inc_nulls\", \"format\": \",\", \"title\": \"# of records\", \"type\": \"quantitative\"}, {\"field\": \"column_name\", \"title\": \"Column name\", \"type\": \"nominal\"}, {\"field\": \"total_null_rows\", \"format\": \",\", \"title\": \"# of nulls\", \"type\": \"quantitative\"}, {\"field\": \"completeness\", \"format\": \".1%\", \"type\": \"quantitative\"}], \"x\": {\"axis\": {\"labelAngle\": 20}, \"field\": \"column_name\", \"sort\": {\"field\": \"mean_comp\", \"order\": \"descending\"}, \"title\": \"Column name\", \"type\": \"nominal\"}, \"y\": {\"field\": \"source_dataset\", \"title\": \"Source dataset\", \"type\": \"nominal\"}}, \"title\": \"Column completeness by source dataset\", \"transform\": [{\"joinaggregate\": [{\"op\": \"mean\", \"field\": \"completeness\", \"as\": \"mean_comp\"}], \"groupby\": [\"column_name\"]}]}, {\"mark\": {\"type\": \"text\"}, \"encoding\": {\"color\": {\"condition\": {\"test\": \"datum['completeness'] < 0.5\", \"value\": \"white\"}, \"value\": \"black\"}, \"text\": {\"field\": \"completeness\", \"format\": \".0%\", \"type\": \"quantitative\"}, \"x\": {\"axis\": {\"labelAngle\": 0}, \"field\": \"column_name\", \"sort\": {\"field\": \"mean_comp\", \"order\": \"descending\"}, \"type\": \"nominal\"}, \"y\": {\"field\": \"source_dataset\", \"type\": \"nominal\"}}, \"transform\": [{\"joinaggregate\": [{\"op\": \"mean\", \"field\": \"completeness\", \"as\": \"mean_comp\"}], \"groupby\": [\"column_name\"]}]}], \"data\": {\"name\": \"data-fb2bb6472d120ab63768ece05202f6ba\"}, \"height\": {\"step\": 40}, \"width\": {\"step\": 40}, 
\"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-fb2bb6472d120ab63768ece05202f6ba\": [{\"source_dataset\": \"input_data_1\", \"column_name\": \"record_id\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 230320, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"report_date\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 230320, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"report_year\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 230320, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name\", \"total_null_rows\": 4, \"total_rows_inc_nulls\": 230320, \"completeness\": 0.9999826550483704}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"street_address\", \"total_null_rows\": 272, \"total_rows_inc_nulls\": 230320, \"completeness\": 0.9988190531730652}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"street_address_2\", \"total_null_rows\": 126403, \"total_rows_inc_nulls\": 230320, \"completeness\": 0.45118531584739685}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"city\", \"total_null_rows\": 107, \"total_rows_inc_nulls\": 230320, \"completeness\": 0.9995354413986206}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"state\", \"total_null_rows\": 206, \"total_rows_inc_nulls\": 230320, \"completeness\": 0.9991055727005005}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"zip_code\", \"total_null_rows\": 619, \"total_rows_inc_nulls\": 230320, \"completeness\": 0.9973124265670776}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"phone_number\", \"total_null_rows\": 6194, \"total_rows_inc_nulls\": 230320, \"completeness\": 0.9731069803237915}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name_mphone\", \"total_null_rows\": 4, \"total_rows_inc_nulls\": 230320, \"completeness\": 0.9999826550483704}]}}, {\"mode\": 
\"vega-lite\"});\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"layer\": [{\"mark\": \"rect\", \"encoding\": {\"color\": {\"field\": \"completeness\", \"legend\": null, \"scale\": {\"scheme\": \"darkred\", \"zero\": true}, \"type\": \"quantitative\"}, \"tooltip\": [{\"field\": \"source_dataset\", \"title\": \"Source dataset\", \"type\": \"nominal\"}, {\"field\": \"total_rows_inc_nulls\", \"format\": \",\", \"title\": \"# of records\", \"type\": \"quantitative\"}, {\"field\": \"column_name\", \"title\": \"Column name\", \"type\": \"nominal\"}, {\"field\": \"total_null_rows\", \"format\": \",\", \"title\": \"# of nulls\", \"type\": \"quantitative\"}, {\"field\": \"completeness\", \"format\": \".1%\", \"type\": \"quantitative\"}], \"x\": {\"axis\": {\"labelAngle\": 20}, \"field\": \"column_name\", \"sort\": {\"field\": \"mean_comp\", \"order\": \"descending\"}, \"title\": \"Column name\", \"type\": \"nominal\"}, \"y\": {\"field\": \"source_dataset\", \"title\": \"Source dataset\", \"type\": \"nominal\"}}, \"title\": \"Column completeness by source dataset\", \"transform\": [{\"joinaggregate\": [{\"op\": \"mean\", \"field\": \"completeness\", \"as\": \"mean_comp\"}], \"groupby\": [\"column_name\"]}]}, {\"mark\": {\"type\": \"text\"}, \"encoding\": {\"color\": {\"condition\": {\"test\": \"datum['completeness'] < 0.5\", \"value\": \"white\"}, \"value\": \"black\"}, \"text\": {\"field\": \"completeness\", \"format\": \".0%\", \"type\": \"quantitative\"}, \"x\": {\"axis\": {\"labelAngle\": 0}, \"field\": \"column_name\", \"sort\": {\"field\": \"mean_comp\", \"order\": \"descending\"}, \"type\": \"nominal\"}, \"y\": {\"field\": \"source_dataset\", \"type\": \"nominal\"}}, \"transform\": [{\"joinaggregate\": [{\"op\": \"mean\", \"field\": \"completeness\", \"as\": \"mean_comp\"}], \"groupby\": [\"column_name\"]}]}], \"data\": {\"name\": \"data-a6b030dc7069d2f4600013c4a9b5bad7\"}, \"height\": {\"step\": 40}, \"width\": {\"step\": 40}, 
\"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-a6b030dc7069d2f4600013c4a9b5bad7\": [{\"source_dataset\": \"input_data_1\", \"column_name\": \"record_id\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 64515, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"report_date\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 64515, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"report_year\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 64515, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name\", \"total_null_rows\": 2, \"total_rows_inc_nulls\": 64515, \"completeness\": 0.9999690055847168}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name_no_legal\", \"total_null_rows\": 2, \"total_rows_inc_nulls\": 64515, \"completeness\": 0.9999690055847168}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"street_address\", \"total_null_rows\": 96, \"total_rows_inc_nulls\": 64515, \"completeness\": 0.9985119700431824}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"street_address_list\", \"total_null_rows\": 96, \"total_rows_inc_nulls\": 64515, \"completeness\": 0.9985119700431824}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"street_address_2\", \"total_null_rows\": 34486, \"total_rows_inc_nulls\": 64515, \"completeness\": 0.46545764803886414}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"city\", \"total_null_rows\": 69, \"total_rows_inc_nulls\": 64515, \"completeness\": 0.9989304542541504}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"state\", \"total_null_rows\": 109, \"total_rows_inc_nulls\": 64515, \"completeness\": 0.9983104467391968}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"zip_code\", \"total_null_rows\": 274, \"total_rows_inc_nulls\": 64515, \"completeness\": 0.9957529306411743}, {\"source_dataset\": 
\"input_data_1\", \"column_name\": \"phone_number\", \"total_null_rows\": 2914, \"total_rows_inc_nulls\": 64515, \"completeness\": 0.9548321962356567}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name_mphone\", \"total_null_rows\": 2, \"total_rows_inc_nulls\": 64515, \"completeness\": 0.9999690055847168}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name_mphone_list\", \"total_null_rows\": 2, \"total_rows_inc_nulls\": 64515, \"completeness\": 0.9999690055847168}]}}, {\"mode\": \"vega-lite\"});\n",
        "</script>"
       ],
       "text/plain": [
        "alt.LayerChart(...)"
       ]
      },
-     "execution_count": 233,
+     "execution_count": 129,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
+    "# this goes way down when we start matching in the Ex. 21 subsidiaries\n",
     "completeness_chart(sec_match_df, db_api=db_api)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 234,
+   "execution_count": 130,
    "id": "02063bcd-8301-4a70-aab1-0bbf6119cf8b",
    "metadata": {},
    "outputs": [
@@ -1639,23 +1656,23 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-b9db261bb6eb4d978d2b694dc2c37711.vega-embed {\n",
+       "  #altair-viz-278a5be917034b29a93d18bbbb0a987c.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-b9db261bb6eb4d978d2b694dc2c37711.vega-embed details,\n",
-       "  #altair-viz-b9db261bb6eb4d978d2b694dc2c37711.vega-embed details summary {\n",
+       "  #altair-viz-278a5be917034b29a93d18bbbb0a987c.vega-embed details,\n",
+       "  #altair-viz-278a5be917034b29a93d18bbbb0a987c.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-b9db261bb6eb4d978d2b694dc2c37711\"></div>\n",
+       "<div id=\"altair-viz-278a5be917034b29a93d18bbbb0a987c\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-b9db261bb6eb4d978d2b694dc2c37711\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-b9db261bb6eb4d978d2b694dc2c37711\");\n",
+       "    if (outputDiv.id !== \"altair-viz-278a5be917034b29a93d18bbbb0a987c\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-278a5be917034b29a93d18bbbb0a987c\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -1701,14 +1718,14 @@
        "        .catch(showError)\n",
        "        .then(() => displayChart(vegaEmbed));\n",
        "    }\n",
-       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"layer\": [{\"mark\": \"rect\", \"encoding\": {\"color\": {\"field\": \"completeness\", \"legend\": null, \"scale\": {\"scheme\": \"darkred\", \"zero\": true}, \"type\": \"quantitative\"}, \"tooltip\": [{\"field\": \"source_dataset\", \"title\": \"Source dataset\", \"type\": \"nominal\"}, {\"field\": \"total_rows_inc_nulls\", \"format\": \",\", \"title\": \"# of records\", \"type\": \"quantitative\"}, {\"field\": \"column_name\", \"title\": \"Column name\", \"type\": \"nominal\"}, {\"field\": \"total_null_rows\", \"format\": \",\", \"title\": \"# of nulls\", \"type\": \"quantitative\"}, {\"field\": \"completeness\", \"format\": \".1%\", \"type\": \"quantitative\"}], \"x\": {\"axis\": {\"labelAngle\": 20}, \"field\": \"column_name\", \"sort\": {\"field\": \"mean_comp\", \"order\": \"descending\"}, \"title\": \"Column name\", \"type\": \"nominal\"}, \"y\": {\"field\": \"source_dataset\", \"title\": \"Source dataset\", \"type\": \"nominal\"}}, \"title\": \"Column completeness by source dataset\", \"transform\": [{\"joinaggregate\": [{\"op\": \"mean\", \"field\": \"completeness\", \"as\": \"mean_comp\"}], \"groupby\": [\"column_name\"]}]}, {\"mark\": {\"type\": \"text\"}, \"encoding\": {\"color\": {\"condition\": {\"test\": \"datum['completeness'] < 0.5\", \"value\": \"white\"}, \"value\": \"black\"}, \"text\": {\"field\": \"completeness\", \"format\": \".0%\", \"type\": \"quantitative\"}, \"x\": {\"axis\": {\"labelAngle\": 0}, \"field\": \"column_name\", \"sort\": {\"field\": \"mean_comp\", \"order\": \"descending\"}, \"type\": \"nominal\"}, \"y\": {\"field\": \"source_dataset\", \"type\": \"nominal\"}}, \"transform\": [{\"joinaggregate\": [{\"op\": \"mean\", \"field\": \"completeness\", \"as\": \"mean_comp\"}], \"groupby\": [\"column_name\"]}]}], \"data\": {\"name\": \"data-3736beb63dd0913fa1793471df7936c9\"}, \"height\": {\"step\": 40}, \"width\": {\"step\": 40}, 
\"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-3736beb63dd0913fa1793471df7936c9\": [{\"source_dataset\": \"input_data_1\", \"column_name\": \"record_id\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 176366, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"report_date\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 176366, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"report_year\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 176366, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 176366, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"street_address\", \"total_null_rows\": 70684, \"total_rows_inc_nulls\": 176366, \"completeness\": 0.599219799041748}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"street_address_2\", \"total_null_rows\": 142421, \"total_rows_inc_nulls\": 176366, \"completeness\": 0.19246907532215118}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"city\", \"total_null_rows\": 47174, \"total_rows_inc_nulls\": 176366, \"completeness\": 0.7325221300125122}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"state\", \"total_null_rows\": 19847, \"total_rows_inc_nulls\": 176366, \"completeness\": 0.8874669671058655}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"zip_code\", \"total_null_rows\": 48235, \"total_rows_inc_nulls\": 176366, \"completeness\": 0.726506233215332}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"phone_number\", \"total_null_rows\": 164751, \"total_rows_inc_nulls\": 176366, \"completeness\": 0.06585736572742462}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name_mphone\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 176366, \"completeness\": 1.0}]}}, {\"mode\": \"vega-lite\"});\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"layer\": [{\"mark\": \"rect\", \"encoding\": {\"color\": {\"field\": \"completeness\", \"legend\": null, \"scale\": {\"scheme\": \"darkred\", \"zero\": true}, \"type\": \"quantitative\"}, \"tooltip\": [{\"field\": \"source_dataset\", \"title\": \"Source dataset\", \"type\": \"nominal\"}, {\"field\": \"total_rows_inc_nulls\", \"format\": \",\", \"title\": \"# of records\", \"type\": \"quantitative\"}, {\"field\": \"column_name\", \"title\": \"Column name\", \"type\": \"nominal\"}, {\"field\": \"total_null_rows\", \"format\": \",\", \"title\": \"# of nulls\", \"type\": \"quantitative\"}, {\"field\": \"completeness\", \"format\": \".1%\", \"type\": \"quantitative\"}], \"x\": {\"axis\": {\"labelAngle\": 20}, \"field\": \"column_name\", \"sort\": {\"field\": \"mean_comp\", \"order\": \"descending\"}, \"title\": \"Column name\", \"type\": \"nominal\"}, \"y\": {\"field\": \"source_dataset\", \"title\": \"Source dataset\", \"type\": \"nominal\"}}, \"title\": \"Column completeness by source dataset\", \"transform\": [{\"joinaggregate\": [{\"op\": \"mean\", \"field\": \"completeness\", \"as\": \"mean_comp\"}], \"groupby\": [\"column_name\"]}]}, {\"mark\": {\"type\": \"text\"}, \"encoding\": {\"color\": {\"condition\": {\"test\": \"datum['completeness'] < 0.5\", \"value\": \"white\"}, \"value\": \"black\"}, \"text\": {\"field\": \"completeness\", \"format\": \".0%\", \"type\": \"quantitative\"}, \"x\": {\"axis\": {\"labelAngle\": 0}, \"field\": \"column_name\", \"sort\": {\"field\": \"mean_comp\", \"order\": \"descending\"}, \"type\": \"nominal\"}, \"y\": {\"field\": \"source_dataset\", \"type\": \"nominal\"}}, \"transform\": [{\"joinaggregate\": [{\"op\": \"mean\", \"field\": \"completeness\", \"as\": \"mean_comp\"}], \"groupby\": [\"column_name\"]}]}], \"data\": {\"name\": \"data-d98152b9bd4690e94d5eb2c5ee1c5ff9\"}, \"height\": {\"step\": 40}, \"width\": {\"step\": 40}, 
\"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-d98152b9bd4690e94d5eb2c5ee1c5ff9\": [{\"source_dataset\": \"input_data_1\", \"column_name\": \"record_id\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 39407, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"report_date\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 39407, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"report_year\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 39407, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 39407, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name_no_legal\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 39407, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"street_address\", \"total_null_rows\": 19556, \"total_rows_inc_nulls\": 39407, \"completeness\": 0.5037429928779602}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"street_address_list\", \"total_null_rows\": 19556, \"total_rows_inc_nulls\": 39407, \"completeness\": 0.5037429928779602}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"street_address_2\", \"total_null_rows\": 33097, \"total_rows_inc_nulls\": 39407, \"completeness\": 0.16012383997440338}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"city\", \"total_null_rows\": 14129, \"total_rows_inc_nulls\": 39407, \"completeness\": 0.6414596438407898}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"state\", \"total_null_rows\": 9299, \"total_rows_inc_nulls\": 39407, \"completeness\": 0.7640267014503479}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"zip_code\", \"total_null_rows\": 14454, \"total_rows_inc_nulls\": 39407, \"completeness\": 0.6332123875617981}, {\"source_dataset\": \"input_data_1\", 
\"column_name\": \"phone_number\", \"total_null_rows\": 39039, \"total_rows_inc_nulls\": 39407, \"completeness\": 0.009338442236185074}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name_mphone\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 39407, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name_mphone_list\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 39407, \"completeness\": 1.0}]}}, {\"mode\": \"vega-lite\"});\n",
        "</script>"
       ],
       "text/plain": [
        "alt.LayerChart(...)"
       ]
      },
-     "execution_count": 234,
+     "execution_count": 130,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1719,18 +1736,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 209,
-   "id": "c4542c1f-d826-43c1-9af5-ce6473b79d90",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# could sub in zip code for street address?\n",
-    "match_cols = [\"company_name\", \"state\", \"city\", \"street_address\", \"zip_code\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 210,
+   "execution_count": 131,
    "id": "95bd4db7-c447-4a0e-ab37-59d4beac0b11",
    "metadata": {},
    "outputs": [
@@ -1739,23 +1745,23 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-d91c69c848924e72ba734dbe839979d9.vega-embed {\n",
+       "  #altair-viz-122f838f22854f7fb81a5c63e6aba8bf.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-d91c69c848924e72ba734dbe839979d9.vega-embed details,\n",
-       "  #altair-viz-d91c69c848924e72ba734dbe839979d9.vega-embed details summary {\n",
+       "  #altair-viz-122f838f22854f7fb81a5c63e6aba8bf.vega-embed details,\n",
+       "  #altair-viz-122f838f22854f7fb81a5c63e6aba8bf.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-d91c69c848924e72ba734dbe839979d9\"></div>\n",
+       "<div id=\"altair-viz-122f838f22854f7fb81a5c63e6aba8bf\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-d91c69c848924e72ba734dbe839979d9\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-d91c69c848924e72ba734dbe839979d9\");\n",
+       "    if (outputDiv.id !== \"altair-viz-122f838f22854f7fb81a5c63e6aba8bf\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-122f838f22854f7fb81a5c63e6aba8bf\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -1801,14 +1807,14 @@
        "        .catch(showError)\n",
        "        .then(() => displayChart(vegaEmbed));\n",
        "    }\n",
-       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"vconcat\": [{\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9821549654006958, \"percentile_inc_nulls\": 0.9821552634239197, \"value_count\": 30, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 4110.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.9618828296661377, \"percentile_inc_nulls\": 0.9618834853172302, \"value_count\": 29, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 4669.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.9368389844894409, \"percentile_inc_nulls\": 0.9368400573730469, \"value_count\": 28, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 5768.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.9135101437568665, \"percentile_inc_nulls\": 0.9135116338729858, \"value_count\": 27, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 5373.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.899511992931366, \"percentile_inc_nulls\": 0.8995137214660645, \"value_count\": 26, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3224.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.8856180310249329, \"percentile_inc_nulls\": 0.8856199979782104, \"value_count\": 25, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3200.0, \"distinct_value_count\": 47323}, 
{\"percentile_ex_nulls\": 0.8689452409744263, \"percentile_inc_nulls\": 0.8689475655555725, \"value_count\": 24, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3840.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.8524678945541382, \"percentile_inc_nulls\": 0.8524704575538635, \"value_count\": 23, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3795.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.8382352590560913, \"percentile_inc_nulls\": 0.8382381200790405, \"value_count\": 22, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3278.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.8196347951889038, \"percentile_inc_nulls\": 0.8196378946304321, \"value_count\": 21, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 4284.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.8045250773429871, \"percentile_inc_nulls\": 0.8045284748077393, \"value_count\": 20, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3480.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.7848911881446838, \"percentile_inc_nulls\": 0.7848949432373047, \"value_count\": 19, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 4522.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.761445164680481, \"percentile_inc_nulls\": 0.7614492774009705, \"value_count\": 18, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, 
\"sum_tokens_in_value_count_group\": 5400.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.7384159564971924, \"percentile_inc_nulls\": 0.7384204864501953, \"value_count\": 17, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 5304.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.7129899859428406, \"percentile_inc_nulls\": 0.7129949331283569, \"value_count\": 16, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 5856.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.6855711340904236, \"percentile_inc_nulls\": 0.6855765581130981, \"value_count\": 15, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 6315.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.659433126449585, \"percentile_inc_nulls\": 0.6594390273094177, \"value_count\": 14, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 6020.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.630138635635376, \"percentile_inc_nulls\": 0.6301450133323669, \"value_count\": 13, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 6747.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.5980435609817505, \"percentile_inc_nulls\": 0.5980505347251892, \"value_count\": 12, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 7392.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.5616500377655029, \"percentile_inc_nulls\": 0.5616576671600342, \"value_count\": 11, \"group_name\": \"_company_name_\", 
\"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 8382.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.5209668874740601, \"percentile_inc_nulls\": 0.5209751725196838, \"value_count\": 10, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 9370.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.473879337310791, \"percentile_inc_nulls\": 0.4738885164260864, \"value_count\": 9, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 10845.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.4248337149620056, \"percentile_inc_nulls\": 0.4248436689376831, \"value_count\": 8, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 11296.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.3734694719314575, \"percentile_inc_nulls\": 0.3734803795814514, \"value_count\": 7, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 11830.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.3194654583930969, \"percentile_inc_nulls\": 0.31947726011276245, \"value_count\": 6, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 12438.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.2599167823791504, \"percentile_inc_nulls\": 0.2599296569824219, \"value_count\": 5, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 13715.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.19405949115753174, \"percentile_inc_nulls\": 
0.19407343864440918, \"value_count\": 4, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 15168.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.13083326816558838, \"percentile_inc_nulls\": 0.13084840774536133, \"value_count\": 3, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 14562.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.0675550103187561, \"percentile_inc_nulls\": 0.06757122278213501, \"value_count\": 2, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 14574.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 1.7344951629638672e-05, \"value_count\": 1, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 15559.0, \"distinct_value_count\": 47323}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 30, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 4110.0, \"distinct_value_count\": 47323}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of 
values in column \\\"company_name\\\"\", \"subtitle\": \"In this col, 4 values (0.0%) are null and there are 47323 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 30, \"group_name\": \"_company_name_\", \"value\": \"sherwin williams company\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 47323}, {\"value_count\": 30, \"group_name\": \"_company_name_\", \"value\": \"simmons first national corporation\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 47323}, {\"value_count\": 30, \"group_name\": \"_company_name_\", \"value\": \"smith a o corporation\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 47323}, {\"value_count\": 30, \"group_name\": \"_company_name_\", \"value\": \"unifi incorporated\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 47323}, {\"value_count\": 30, \"group_name\": \"_company_name_\", \"value\": \"universal corp /va/\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 47323}, {\"value_count\": 30, \"group_name\": \"_company_name_\", \"value\": \"vulcan materials company\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 47323}, {\"value_count\": 30, \"group_name\": \"_company_name_\", \"value\": \"boeing company\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 47323}, {\"value_count\": 30, \"group_name\": \"_company_name_\", \"value\": \"rayonier incorporated\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 47323}, {\"value_count\": 30, \"group_name\": \"_company_name_\", \"value\": \"wesbanco incorporated\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 47323}, {\"value_count\": 30, \"group_name\": 
\"_company_name_\", \"value\": \"deere john capital corporation\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 47323}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"ambers stores incorporated\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 47323}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"nx networks incorporated\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 47323}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"hwcc tunica incorporated\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 47323}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"james maritime holdings incorporated\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 47323}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"sportmart incorporated\", \"total_non_null_rows\": 230316, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 47323}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": 
{\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 30]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.8475755453109741, \"percentile_inc_nulls\": 0.8477118611335754, \"value_count\": 35075, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 35075.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.7260271310806274, \"percentile_inc_nulls\": 0.7262721061706543, \"value_count\": 27970, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 27970.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.6417819261550903, \"percentile_inc_nulls\": 0.6421023011207581, \"value_count\": 19386, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 19386.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.5964608788490295, \"percentile_inc_nulls\": 0.5968217849731445, \"value_count\": 10429, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 10429.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.5520437955856323, \"percentile_inc_nulls\": 0.5524444580078125, \"value_count\": 10221, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 10221.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.5123851299285889, \"percentile_inc_nulls\": 0.5128213167190552, \"value_count\": 9126, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, 
\"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 9126.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.4752122759819031, \"percentile_inc_nulls\": 0.4756816625595093, \"value_count\": 8554, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 8554.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.43829578161239624, \"percentile_inc_nulls\": 0.438798189163208, \"value_count\": 8495, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 8495.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.41224348545074463, \"percentile_inc_nulls\": 0.4127691984176636, \"value_count\": 5995, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 5995.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.3880468010902405, \"percentile_inc_nulls\": 0.3885941505432129, \"value_count\": 5568, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 5568.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.3645975589752197, \"percentile_inc_nulls\": 0.3651658296585083, \"value_count\": 5396, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 5396.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.3423563838005066, \"percentile_inc_nulls\": 0.3429446220397949, \"value_count\": 5118, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 5118.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.32081925868988037, \"percentile_inc_nulls\": 0.3214266896247864, \"value_count\": 4956, \"group_name\": \"_state_\", 
\"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 4956.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.2995384931564331, \"percentile_inc_nulls\": 0.30016499757766724, \"value_count\": 4897, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 4897.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.2785836458206177, \"percentile_inc_nulls\": 0.27922892570495605, \"value_count\": 4822, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 4822.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.2577331066131592, \"percentile_inc_nulls\": 0.2583969831466675, \"value_count\": 4798, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 4798.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.23978984355926514, \"percentile_inc_nulls\": 0.24046975374221802, \"value_count\": 4129, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 4129.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.22265487909317017, \"percentile_inc_nulls\": 0.22335010766983032, \"value_count\": 3943, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3943.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.20564591884613037, \"percentile_inc_nulls\": 0.20635634660720825, \"value_count\": 3914, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3914.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.1933215856552124, \"percentile_inc_nulls\": 0.1940430998802185, \"value_count\": 2836, 
\"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2836.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.18145787715911865, \"percentile_inc_nulls\": 0.18219000101089478, \"value_count\": 2730, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2730.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.16995924711227417, \"percentile_inc_nulls\": 0.17070162296295166, \"value_count\": 2646, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2646.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.1585344672203064, \"percentile_inc_nulls\": 0.15928709506988525, \"value_count\": 2629, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2629.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.1479397416114807, \"percentile_inc_nulls\": 0.14870178699493408, \"value_count\": 2438, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2438.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.13760137557983398, \"percentile_inc_nulls\": 0.13837271928787231, \"value_count\": 2379, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2379.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.12890130281448364, \"percentile_inc_nulls\": 0.12968045473098755, \"value_count\": 2002, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2002.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.12041854858398438, \"percentile_inc_nulls\": 
0.12120527029037476, \"value_count\": 1952, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1952.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.11287450790405273, \"percentile_inc_nulls\": 0.11366796493530273, \"value_count\": 1736, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1736.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.10543036460876465, \"percentile_inc_nulls\": 0.10623043775558472, \"value_count\": 1713, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1713.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.09882056713104248, \"percentile_inc_nulls\": 0.09962660074234009, \"value_count\": 1521, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1521.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.09293222427368164, \"percentile_inc_nulls\": 0.09374350309371948, \"value_count\": 1355, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1355.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.08733934164047241, \"percentile_inc_nulls\": 0.08815562725067139, \"value_count\": 1287, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1287.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.08176815509796143, \"percentile_inc_nulls\": 0.08258944749832153, \"value_count\": 1282, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1282.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 
0.07628393173217773, \"percentile_inc_nulls\": 0.07711011171340942, \"value_count\": 1262, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1262.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.07103002071380615, \"percentile_inc_nulls\": 0.0718609094619751, \"value_count\": 1209, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1209.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.06596297025680542, \"percentile_inc_nulls\": 0.0667983889579773, \"value_count\": 1166, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1166.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.06157815456390381, \"percentile_inc_nulls\": 0.06241750717163086, \"value_count\": 1009, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1009.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.05761057138442993, \"percentile_inc_nulls\": 0.05845344066619873, \"value_count\": 913, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 913.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.054459989070892334, \"percentile_inc_nulls\": 0.055305659770965576, \"value_count\": 725, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 725.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.05156141519546509, \"percentile_inc_nulls\": 0.05240970849990845, \"value_count\": 667, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 667.0, \"distinct_value_count\": 173}, 
{\"percentile_ex_nulls\": 0.0487019419670105, \"percentile_inc_nulls\": 0.0495527982711792, \"value_count\": 658, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 658.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0459250807762146, \"percentile_inc_nulls\": 0.04677838087081909, \"value_count\": 639, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 639.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.04325681924819946, \"percentile_inc_nulls\": 0.044112563133239746, \"value_count\": 614, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 614.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.04063636064529419, \"percentile_inc_nulls\": 0.04149442911148071, \"value_count\": 603, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 603.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.03809851408004761, \"percentile_inc_nulls\": 0.038958847522735596, \"value_count\": 584, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 584.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.03566926717758179, \"percentile_inc_nulls\": 0.036531805992126465, \"value_count\": 559, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 559.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.03332263231277466, \"percentile_inc_nulls\": 0.0341871976852417, \"value_count\": 540, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 540.0, 
\"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.031075894832611084, \"percentile_inc_nulls\": 0.03194248676300049, \"value_count\": 517, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 517.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.029320240020751953, \"percentile_inc_nulls\": 0.030188441276550293, \"value_count\": 404, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 404.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.027595043182373047, \"percentile_inc_nulls\": 0.02846473455429077, \"value_count\": 397, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 397.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.026056647300720215, \"percentile_inc_nulls\": 0.02692776918411255, \"value_count\": 354, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 354.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.024522602558135986, \"percentile_inc_nulls\": 0.025395095348358154, \"value_count\": 353, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 353.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.02299731969833374, \"percentile_inc_nulls\": 0.023871123790740967, \"value_count\": 351, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 351.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.021511077880859375, \"percentile_inc_nulls\": 0.022386252880096436, \"value_count\": 342, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, 
\"sum_tokens_in_value_count_group\": 342.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.020146548748016357, \"percentile_inc_nulls\": 0.021022915840148926, \"value_count\": 314, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 314.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.018903672695159912, \"percentile_inc_nulls\": 0.019781172275543213, \"value_count\": 286, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 286.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.017669498920440674, \"percentile_inc_nulls\": 0.018548130989074707, \"value_count\": 284, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 284.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.016574382781982422, \"percentile_inc_nulls\": 0.01745396852493286, \"value_count\": 252, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 252.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.015501022338867188, \"percentile_inc_nulls\": 0.016381561756134033, \"value_count\": 247, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 247.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.014471113681793213, \"percentile_inc_nulls\": 0.01535254716873169, \"value_count\": 237, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 237.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.013471603393554688, \"percentile_inc_nulls\": 0.014353930950164795, \"value_count\": 230, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, 
\"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 230.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.012498140335083008, \"percentile_inc_nulls\": 0.013381361961364746, \"value_count\": 224, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 224.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.011594235897064209, \"percentile_inc_nulls\": 0.012478291988372803, \"value_count\": 208, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 208.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.010781586170196533, \"percentile_inc_nulls\": 0.011666357517242432, \"value_count\": 187, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 187.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.010086297988891602, \"percentile_inc_nulls\": 0.010971665382385254, \"value_count\": 160, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.008739173412322998, \"percentile_inc_nulls\": 0.009625732898712158, \"value_count\": 155, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 310.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.008069932460784912, \"percentile_inc_nulls\": 0.008957087993621826, \"value_count\": 154, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 154.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.007483243942260742, \"percentile_inc_nulls\": 0.008370935916900635, \"value_count\": 135, \"group_name\": \"_state_\", 
\"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 135.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.007044315338134766, \"percentile_inc_nulls\": 0.007932424545288086, \"value_count\": 101, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 101.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.006609737873077393, \"percentile_inc_nulls\": 0.007498264312744141, \"value_count\": 100, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 100.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0062621235847473145, \"percentile_inc_nulls\": 0.0071509480476379395, \"value_count\": 80, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 80.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.005979657173156738, \"percentile_inc_nulls\": 0.006868720054626465, \"value_count\": 65, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 65.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.005710184574127197, \"percentile_inc_nulls\": 0.006599485874176025, \"value_count\": 62, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 62.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.005188703536987305, \"percentile_inc_nulls\": 0.006078481674194336, \"value_count\": 60, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.004936695098876953, \"percentile_inc_nulls\": 0.0058266520500183105, \"value_count\": 58, 
\"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0047324299812316895, \"percentile_inc_nulls\": 0.0056226253509521484, \"value_count\": 47, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 47.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.004536867141723633, \"percentile_inc_nulls\": 0.005427241325378418, \"value_count\": 45, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 45.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.004384756088256836, \"percentile_inc_nulls\": 0.005275249481201172, \"value_count\": 35, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 35.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.004089295864105225, \"percentile_inc_nulls\": 0.004980027675628662, \"value_count\": 34, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 68.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.003945887088775635, \"percentile_inc_nulls\": 0.004836738109588623, \"value_count\": 33, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0038068294525146484, \"percentile_inc_nulls\": 0.0046977996826171875, \"value_count\": 32, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 32.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0036807656288146973, \"percentile_inc_nulls\": 
0.0045719146728515625, \"value_count\": 29, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 29.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.003559112548828125, \"percentile_inc_nulls\": 0.004450321197509766, \"value_count\": 28, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 28.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.003207087516784668, \"percentile_inc_nulls\": 0.004098653793334961, \"value_count\": 27, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 81.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.002981126308441162, \"percentile_inc_nulls\": 0.0038728713989257812, \"value_count\": 26, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 52.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0028768181800842285, \"percentile_inc_nulls\": 0.0037686824798583984, \"value_count\": 24, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 24.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0027768611907958984, \"percentile_inc_nulls\": 0.0036687850952148438, \"value_count\": 23, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 23.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.002681255340576172, \"percentile_inc_nulls\": 0.003573298454284668, \"value_count\": 22, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 22.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.002590000629425049, 
\"percentile_inc_nulls\": 0.0034821033477783203, \"value_count\": 21, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 21.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0022423863410949707, \"percentile_inc_nulls\": 0.003134787082672119, \"value_count\": 20, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 80.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0019946694374084473, \"percentile_inc_nulls\": 0.0028873085975646973, \"value_count\": 19, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 57.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0019164681434631348, \"percentile_inc_nulls\": 0.0028091073036193848, \"value_count\": 18, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 18.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.001846909523010254, \"percentile_inc_nulls\": 0.0027396678924560547, \"value_count\": 16, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.001603543758392334, \"percentile_inc_nulls\": 0.0024965405464172363, \"value_count\": 14, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0014340877532958984, \"percentile_inc_nulls\": 0.0023272037506103516, \"value_count\": 13, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 39.0, \"distinct_value_count\": 173}, 
{\"percentile_ex_nulls\": 0.001277625560760498, \"percentile_inc_nulls\": 0.0021709203720092773, \"value_count\": 12, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 36.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0011342167854309082, \"percentile_inc_nulls\": 0.0020276308059692383, \"value_count\": 11, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.001003861427307129, \"percentile_inc_nulls\": 0.0018973350524902344, \"value_count\": 10, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 30.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0008864998817443848, \"percentile_inc_nulls\": 0.0017801523208618164, \"value_count\": 9, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 27.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0008170008659362793, \"percentile_inc_nulls\": 0.001710653305053711, \"value_count\": 8, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0005736351013183594, \"percentile_inc_nulls\": 0.0014675259590148926, \"value_count\": 7, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0003911256790161133, \"percentile_inc_nulls\": 0.0012851953506469727, \"value_count\": 6, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 42.0, 
\"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.00028246641159057617, \"percentile_inc_nulls\": 0.001176595687866211, \"value_count\": 5, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 25.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.00019556283950805664, \"percentile_inc_nulls\": 0.0010898113250732422, \"value_count\": 4, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 20.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 9.125471115112305e-05, \"percentile_inc_nulls\": 0.000985562801361084, \"value_count\": 3, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 24.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 3.910064697265625e-05, \"percentile_inc_nulls\": 0.0009334683418273926, \"value_count\": 2, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 12.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0008944272994995117, \"value_count\": 1, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 9.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 35075, \"group_name\": \"_state_\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 35075.0, \"distinct_value_count\": 173}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": 
\"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"state\\\"\", \"subtitle\": \"In this col, 206 values (0.1%) are null and there are 173 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 35075, \"group_name\": \"_state_\", \"value\": \"ca\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 173}, {\"value_count\": 27970, \"group_name\": \"_state_\", \"value\": \"ny\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 173}, {\"value_count\": 19386, \"group_name\": \"_state_\", \"value\": \"tx\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 173}, {\"value_count\": 10429, \"group_name\": \"_state_\", \"value\": \"ma\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 173}, {\"value_count\": 10221, \"group_name\": \"_state_\", \"value\": \"fl\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 173}, {\"value_count\": 9126, \"group_name\": \"_state_\", \"value\": \"il\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 173}, {\"value_count\": 8554, \"group_name\": \"_state_\", \"value\": \"pa\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 173}, {\"value_count\": 8495, \"group_name\": \"_state_\", \"value\": \"nj\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 173}, {\"value_count\": 5995, \"group_name\": \"_state_\", \"value\": \"oh\", 
\"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 173}, {\"value_count\": 5568, \"group_name\": \"_state_\", \"value\": \"va\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 173}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"y0\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 173}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"p2\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 173}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"r4\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 173}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"h9\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 173}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"a7\", \"total_non_null_rows\": 230114, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 173}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": 
\"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 35075]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9145660996437073, \"percentile_inc_nulls\": 0.9146057367324829, \"value_count\": 19668, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 19668.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.8862184286117554, \"percentile_inc_nulls\": 0.8862712979316711, \"value_count\": 6526, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 6526.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.8684000968933105, \"percentile_inc_nulls\": 0.8684612512588501, \"value_count\": 4102, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 4102.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.8539091944694519, \"percentile_inc_nulls\": 0.8539770841598511, \"value_count\": 3336, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3336.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.8407561779022217, \"percentile_inc_nulls\": 0.8408301472663879, \"value_count\": 3028, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3028.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.8295056819915771, \"percentile_inc_nulls\": 0.8295849561691284, \"value_count\": 2590, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2590.0, 
\"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.8183161020278931, \"percentile_inc_nulls\": 0.818400502204895, \"value_count\": 2576, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2576.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.8077041506767273, \"percentile_inc_nulls\": 0.8077934980392456, \"value_count\": 2443, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2443.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.7973007559776306, \"percentile_inc_nulls\": 0.7973949313163757, \"value_count\": 2395, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2395.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.7871319055557251, \"percentile_inc_nulls\": 0.7872307896614075, \"value_count\": 2341, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2341.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.7781966924667358, \"percentile_inc_nulls\": 0.7782997488975525, \"value_count\": 2057, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2057.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.7695308327674866, \"percentile_inc_nulls\": 0.7696378827095032, \"value_count\": 1995, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1995.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.761177659034729, \"percentile_inc_nulls\": 0.7612886428833008, \"value_count\": 1923, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, 
\"sum_tokens_in_value_count_group\": 1923.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.7531590461730957, \"percentile_inc_nulls\": 0.7532737255096436, \"value_count\": 1846, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1846.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.7454574704170227, \"percentile_inc_nulls\": 0.7455757260322571, \"value_count\": 1773, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1773.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.7377602458000183, \"percentile_inc_nulls\": 0.7378820776939392, \"value_count\": 1772, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1772.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.7311316132545471, \"percentile_inc_nulls\": 0.7312564849853516, \"value_count\": 1526, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1526.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.72495037317276, \"percentile_inc_nulls\": 0.7250781655311584, \"value_count\": 1423, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1423.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.7190427780151367, \"percentile_inc_nulls\": 0.7191733121871948, \"value_count\": 1360, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1360.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.713752031326294, \"percentile_inc_nulls\": 0.7138850688934326, \"value_count\": 1218, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, 
\"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1218.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.7086306810379028, \"percentile_inc_nulls\": 0.7087661027908325, \"value_count\": 1179, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1179.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.7035441398620605, \"percentile_inc_nulls\": 0.7036818265914917, \"value_count\": 1171, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1171.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6986442804336548, \"percentile_inc_nulls\": 0.6987842917442322, \"value_count\": 1128, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1128.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6939703226089478, \"percentile_inc_nulls\": 0.6941125392913818, \"value_count\": 1076, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1076.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6893181800842285, \"percentile_inc_nulls\": 0.6894624829292297, \"value_count\": 1071, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1071.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6846963167190552, \"percentile_inc_nulls\": 0.6848428249359131, \"value_count\": 1064, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1064.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6800962686538696, \"percentile_inc_nulls\": 0.6802448630332947, \"value_count\": 1059, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1059.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.675557017326355, \"percentile_inc_nulls\": 0.6757076978683472, \"value_count\": 1045, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1045.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6712740063667297, \"percentile_inc_nulls\": 0.6714267134666443, \"value_count\": 986, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 986.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6669953465461731, \"percentile_inc_nulls\": 0.6671500205993652, \"value_count\": 985, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 985.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6627557873725891, \"percentile_inc_nulls\": 0.6629124879837036, \"value_count\": 976, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 976.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.658542275428772, \"percentile_inc_nulls\": 0.6587009429931641, \"value_count\": 970, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 970.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6544721722602844, \"percentile_inc_nulls\": 0.6546326875686646, \"value_count\": 937, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 937.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6505627632141113, \"percentile_inc_nulls\": 0.6507250666618347, \"value_count\": 900, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 900.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6466923952102661, \"percentile_inc_nulls\": 0.6468565464019775, \"value_count\": 891, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 891.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6429871320724487, \"percentile_inc_nulls\": 0.6431530117988586, \"value_count\": 853, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 853.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6356287002563477, \"percentile_inc_nulls\": 0.6357979774475098, \"value_count\": 847, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1694.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.632053792476654, \"percentile_inc_nulls\": 0.6322247385978699, \"value_count\": 823, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 823.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6285179257392883, \"percentile_inc_nulls\": 0.6286904811859131, \"value_count\": 814, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 814.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.624999463558197, \"percentile_inc_nulls\": 0.6251736879348755, \"value_count\": 810, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 810.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6215765476226807, \"percentile_inc_nulls\": 0.6217523813247681, \"value_count\": 788, 
\"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 788.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6181753277778625, \"percentile_inc_nulls\": 0.6183527708053589, \"value_count\": 783, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 783.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6148914098739624, \"percentile_inc_nulls\": 0.6150703430175781, \"value_count\": 756, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 756.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6116161942481995, \"percentile_inc_nulls\": 0.6117966175079346, \"value_count\": 754, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 754.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.608384370803833, \"percentile_inc_nulls\": 0.6085663437843323, \"value_count\": 744, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 744.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6051656603813171, \"percentile_inc_nulls\": 0.605349063873291, \"value_count\": 741, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 741.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.6019772887229919, \"percentile_inc_nulls\": 0.60216224193573, \"value_count\": 734, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 734.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5987932682037354, \"percentile_inc_nulls\": 0.5989797115325928, \"value_count\": 733, 
\"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 733.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5956918001174927, \"percentile_inc_nulls\": 0.5958796739578247, \"value_count\": 714, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 714.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5925990343093872, \"percentile_inc_nulls\": 0.5927883386611938, \"value_count\": 712, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 712.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.58952796459198, \"percentile_inc_nulls\": 0.5897186398506165, \"value_count\": 707, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 707.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5864698886871338, \"percentile_inc_nulls\": 0.5866620540618896, \"value_count\": 704, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 704.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5835595726966858, \"percentile_inc_nulls\": 0.5837530493736267, \"value_count\": 670, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 670.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5777822732925415, \"percentile_inc_nulls\": 0.5779784917831421, \"value_count\": 665, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1330.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5748980045318604, \"percentile_inc_nulls\": 0.575095534324646, \"value_count\": 664, 
\"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 664.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5720224380493164, \"percentile_inc_nulls\": 0.5722212791442871, \"value_count\": 662, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 662.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5663493871688843, \"percentile_inc_nulls\": 0.5665508508682251, \"value_count\": 653, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1306.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5635216236114502, \"percentile_inc_nulls\": 0.5637243986129761, \"value_count\": 651, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 651.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5607241988182068, \"percentile_inc_nulls\": 0.5609282851219177, \"value_count\": 644, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 644.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.557952880859375, \"percentile_inc_nulls\": 0.5581582188606262, \"value_count\": 638, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 638.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.555185854434967, \"percentile_inc_nulls\": 0.5553925037384033, \"value_count\": 637, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 637.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5524709820747375, \"percentile_inc_nulls\": 0.5526788830757141, \"value_count\": 
625, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 625.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5470933318138123, \"percentile_inc_nulls\": 0.5473037958145142, \"value_count\": 619, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1238.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5444697141647339, \"percentile_inc_nulls\": 0.5446813106536865, \"value_count\": 604, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 604.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5418981313705444, \"percentile_inc_nulls\": 0.5421109795570374, \"value_count\": 592, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 592.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.539383053779602, \"percentile_inc_nulls\": 0.5395970940589905, \"value_count\": 579, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 579.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.536933183670044, \"percentile_inc_nulls\": 0.5371483564376831, \"value_count\": 564, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 564.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5345006585121155, \"percentile_inc_nulls\": 0.5347169041633606, \"value_count\": 560, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 560.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5296703577041626, \"percentile_inc_nulls\": 0.5298888683319092, 
\"value_count\": 556, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1112.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5273073315620422, \"percentile_inc_nulls\": 0.5275269150733948, \"value_count\": 544, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 544.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.525026798248291, \"percentile_inc_nulls\": 0.5252474546432495, \"value_count\": 525, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 525.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5227984189987183, \"percentile_inc_nulls\": 0.5230201482772827, \"value_count\": 513, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 513.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5206004977226257, \"percentile_inc_nulls\": 0.5208232402801514, \"value_count\": 506, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 506.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5185198187828064, \"percentile_inc_nulls\": 0.5187435150146484, \"value_count\": 479, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 479.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5144105553627014, \"percentile_inc_nulls\": 0.5146361589431763, \"value_count\": 473, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 946.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5123603343963623, \"percentile_inc_nulls\": 
0.5125868320465088, \"value_count\": 472, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 472.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5083205699920654, \"percentile_inc_nulls\": 0.5085489749908447, \"value_count\": 465, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 930.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5063441395759583, \"percentile_inc_nulls\": 0.5065734386444092, \"value_count\": 455, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 455.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5043807029724121, \"percentile_inc_nulls\": 0.5046110153198242, \"value_count\": 452, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 452.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5024434328079224, \"percentile_inc_nulls\": 0.5026745796203613, \"value_count\": 446, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 446.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.5005277395248413, \"percentile_inc_nulls\": 0.5007598400115967, \"value_count\": 441, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 441.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4986165165901184, \"percentile_inc_nulls\": 0.49884945154190063, \"value_count\": 440, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 440.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.49672257900238037, 
\"percentile_inc_nulls\": 0.49695640802383423, \"value_count\": 436, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 436.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4948808550834656, \"percentile_inc_nulls\": 0.4951155185699463, \"value_count\": 424, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 424.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.49305206537246704, \"percentile_inc_nulls\": 0.4932876229286194, \"value_count\": 421, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 421.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4912320375442505, \"percentile_inc_nulls\": 0.4914683699607849, \"value_count\": 419, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 419.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4894424080848694, \"percentile_inc_nulls\": 0.48967957496643066, \"value_count\": 412, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 412.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4876744747161865, \"percentile_inc_nulls\": 0.48791247606277466, \"value_count\": 407, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 407.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.48598039150238037, \"percentile_inc_nulls\": 0.4862191677093506, \"value_count\": 390, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 390.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 
0.48432105779647827, \"percentile_inc_nulls\": 0.48456060886383057, \"value_count\": 382, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 382.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4826660752296448, \"percentile_inc_nulls\": 0.48290640115737915, \"value_count\": 381, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 381.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4810153841972351, \"percentile_inc_nulls\": 0.48125648498535156, \"value_count\": 380, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 380.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4793734550476074, \"percentile_inc_nulls\": 0.47961533069610596, \"value_count\": 378, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 378.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4777619242668152, \"percentile_inc_nulls\": 0.478004515171051, \"value_count\": 371, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 371.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4761546850204468, \"percentile_inc_nulls\": 0.4763980507850647, \"value_count\": 370, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 370.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4729924201965332, \"percentile_inc_nulls\": 0.47323721647262573, \"value_count\": 364, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 728.0, \"distinct_value_count\": 5233}, 
{\"percentile_ex_nulls\": 0.47143298387527466, \"percentile_inc_nulls\": 0.47167855501174927, \"value_count\": 359, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 359.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.46989959478378296, \"percentile_inc_nulls\": 0.4701458811759949, \"value_count\": 353, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 353.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4683836102485657, \"percentile_inc_nulls\": 0.4686306118965149, \"value_count\": 349, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 349.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4668763279914856, \"percentile_inc_nulls\": 0.46712398529052734, \"value_count\": 347, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 347.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4653733968734741, \"percentile_inc_nulls\": 0.4656217694282532, \"value_count\": 346, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 346.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4638834595680237, \"percentile_inc_nulls\": 0.46413248777389526, \"value_count\": 343, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 343.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.46240222454071045, \"percentile_inc_nulls\": 0.46265196800231934, \"value_count\": 341, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 341.0, 
\"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4609253406524658, \"percentile_inc_nulls\": 0.46117573976516724, \"value_count\": 340, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 340.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.459452748298645, \"percentile_inc_nulls\": 0.45970386266708374, \"value_count\": 339, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 339.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4579976201057434, \"percentile_inc_nulls\": 0.45824939012527466, \"value_count\": 335, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 335.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.45655542612075806, \"percentile_inc_nulls\": 0.4568079113960266, \"value_count\": 332, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 332.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4551176428794861, \"percentile_inc_nulls\": 0.45537078380584717, \"value_count\": 331, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 331.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.45371025800704956, \"percentile_inc_nulls\": 0.4539640545845032, \"value_count\": 324, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 324.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.45231592655181885, \"percentile_inc_nulls\": 0.4525703191757202, \"value_count\": 321, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, 
\"sum_tokens_in_value_count_group\": 321.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4509432315826416, \"percentile_inc_nulls\": 0.4511983394622803, \"value_count\": 316, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 316.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4496009945869446, \"percentile_inc_nulls\": 0.449856698513031, \"value_count\": 309, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 309.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.44700342416763306, \"percentile_inc_nulls\": 0.447260320186615, \"value_count\": 299, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 598.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4457089900970459, \"percentile_inc_nulls\": 0.4459664821624756, \"value_count\": 298, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 298.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.44442319869995117, \"percentile_inc_nulls\": 0.4446812868118286, \"value_count\": 296, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 296.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.44314175844192505, \"percentile_inc_nulls\": 0.443400502204895, \"value_count\": 295, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 295.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4405876398086548, \"percentile_inc_nulls\": 0.4408475160598755, \"value_count\": 294, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 
230320, \"sum_tokens_in_value_count_group\": 588.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.43931490182876587, \"percentile_inc_nulls\": 0.4395753741264343, \"value_count\": 293, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 293.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.43805521726608276, \"percentile_inc_nulls\": 0.4383162260055542, \"value_count\": 290, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 290.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.435544490814209, \"percentile_inc_nulls\": 0.4358066916465759, \"value_count\": 289, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 578.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4342978000640869, \"percentile_inc_nulls\": 0.4345605969429016, \"value_count\": 287, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 287.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.43305546045303345, \"percentile_inc_nulls\": 0.4333188533782959, \"value_count\": 286, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 286.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4318174719810486, \"percentile_inc_nulls\": 0.4320814609527588, \"value_count\": 285, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 285.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.43059688806533813, \"percentile_inc_nulls\": 0.4308614134788513, \"value_count\": 281, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, 
\"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 281.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4293805956840515, \"percentile_inc_nulls\": 0.42964571714401245, \"value_count\": 280, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 280.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4270002245903015, \"percentile_inc_nulls\": 0.4272664189338684, \"value_count\": 274, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 548.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.42581868171691895, \"percentile_inc_nulls\": 0.4260854721069336, \"value_count\": 272, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 272.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.42464590072631836, \"percentile_inc_nulls\": 0.4249131679534912, \"value_count\": 270, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 270.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.42349910736083984, \"percentile_inc_nulls\": 0.4237669110298157, \"value_count\": 264, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 264.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4212229251861572, \"percentile_inc_nulls\": 0.4214918613433838, \"value_count\": 262, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 524.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4189555048942566, \"percentile_inc_nulls\": 0.41922545433044434, \"value_count\": 261, \"group_name\": \"_city_\", \"total_non_null_rows\": 
230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 522.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4178304672241211, \"percentile_inc_nulls\": 0.41810089349746704, \"value_count\": 259, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 259.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.41558903455734253, \"percentile_inc_nulls\": 0.41586053371429443, \"value_count\": 258, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 516.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4133650064468384, \"percentile_inc_nulls\": 0.41363751888275146, \"value_count\": 256, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 512.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.41115838289260864, \"percentile_inc_nulls\": 0.4114319086074829, \"value_count\": 254, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 508.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.410072386264801, \"percentile_inc_nulls\": 0.41034644842147827, \"value_count\": 250, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 250.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4057633876800537, \"percentile_inc_nulls\": 0.40603941679000854, \"value_count\": 248, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 992.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4046904444694519, \"percentile_inc_nulls\": 0.4049670100212097, \"value_count\": 247, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 247.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4036218523979187, \"percentile_inc_nulls\": 0.4038988947868347, \"value_count\": 246, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 246.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4025663137435913, \"percentile_inc_nulls\": 0.4028438925743103, \"value_count\": 243, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 243.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.40152817964553833, \"percentile_inc_nulls\": 0.40180617570877075, \"value_count\": 239, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 239.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.4004986882209778, \"percentile_inc_nulls\": 0.4007771611213684, \"value_count\": 237, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 237.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3984571099281311, \"percentile_inc_nulls\": 0.39873653650283813, \"value_count\": 235, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 470.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3974406123161316, \"percentile_inc_nulls\": 0.3977205753326416, \"value_count\": 234, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.39642852544784546, \"percentile_inc_nulls\": 0.3967089056968689, \"value_count\": 233, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 233.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.39343130588531494, \"percentile_inc_nulls\": 0.393713116645813, \"value_count\": 230, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 690.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3924669623374939, \"percentile_inc_nulls\": 0.39274919033050537, \"value_count\": 222, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 222.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.39057308435440063, \"percentile_inc_nulls\": 0.39085620641708374, \"value_count\": 218, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 436.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3896304965019226, \"percentile_inc_nulls\": 0.38991403579711914, \"value_count\": 217, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 217.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.388696551322937, \"percentile_inc_nulls\": 0.38898056745529175, \"value_count\": 215, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 215.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.38684606552124023, \"percentile_inc_nulls\": 0.3871309757232666, \"value_count\": 213, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 426.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3859252333641052, \"percentile_inc_nulls\": 0.38621050119400024, \"value_count\": 212, 
\"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 212.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.38500869274139404, \"percentile_inc_nulls\": 0.3852943778038025, \"value_count\": 211, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 211.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3841007947921753, \"percentile_inc_nulls\": 0.38438695669174194, \"value_count\": 209, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 209.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3831973075866699, \"percentile_inc_nulls\": 0.3834838271141052, \"value_count\": 208, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 208.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.382302463054657, \"percentile_inc_nulls\": 0.3825894594192505, \"value_count\": 206, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 206.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.38141196966171265, \"percentile_inc_nulls\": 0.3816993832588196, \"value_count\": 205, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 205.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3805258870124817, \"percentile_inc_nulls\": 0.3808136582374573, \"value_count\": 204, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 204.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3796483874320984, \"percentile_inc_nulls\": 0.3799366354942322, \"value_count\": 
202, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 202.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.37877529859542847, \"percentile_inc_nulls\": 0.3790639042854309, \"value_count\": 201, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 201.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.37790656089782715, \"percentile_inc_nulls\": 0.37819552421569824, \"value_count\": 200, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.37704646587371826, \"percentile_inc_nulls\": 0.37733590602874756, \"value_count\": 198, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 198.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3761950731277466, \"percentile_inc_nulls\": 0.37648487091064453, \"value_count\": 196, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 196.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.37369304895401, \"percentile_inc_nulls\": 0.37398403882980347, \"value_count\": 192, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 576.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.37286776304244995, \"percentile_inc_nulls\": 0.37315911054611206, \"value_count\": 190, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 190.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3712257742881775, \"percentile_inc_nulls\": 0.3715178966522217, 
\"value_count\": 189, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 378.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3687758445739746, \"percentile_inc_nulls\": 0.36906909942626953, \"value_count\": 188, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 564.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3679679036140442, \"percentile_inc_nulls\": 0.36826157569885254, \"value_count\": 186, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 186.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3663780689239502, \"percentile_inc_nulls\": 0.36667245626449585, \"value_count\": 183, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 366.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3655875325202942, \"percentile_inc_nulls\": 0.3658822774887085, \"value_count\": 182, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 182.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.364801287651062, \"percentile_inc_nulls\": 0.36509639024734497, \"value_count\": 181, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 181.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.36401939392089844, \"percentile_inc_nulls\": 0.36431485414505005, \"value_count\": 180, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 180.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.36324185132980347, \"percentile_inc_nulls\": 
0.36353766918182373, \"value_count\": 179, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 179.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.36170417070388794, \"percentile_inc_nulls\": 0.3620007038116455, \"value_count\": 177, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 354.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3594106435775757, \"percentile_inc_nulls\": 0.3597082495689392, \"value_count\": 176, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 528.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3578903079032898, \"percentile_inc_nulls\": 0.3581886291503906, \"value_count\": 175, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 350.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3571431636810303, \"percentile_inc_nulls\": 0.35744184255599976, \"value_count\": 172, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 172.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.35565757751464844, \"percentile_inc_nulls\": 0.35595691204071045, \"value_count\": 171, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 342.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3534422516822815, \"percentile_inc_nulls\": 0.3537425994873047, \"value_count\": 170, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 510.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3519740700721741, 
\"percentile_inc_nulls\": 0.3522750735282898, \"value_count\": 169, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 338.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3512442708015442, \"percentile_inc_nulls\": 0.35154569149017334, \"value_count\": 168, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 168.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3505188822746277, \"percentile_inc_nulls\": 0.3508206009864807, \"value_count\": 167, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 167.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.34908539056777954, \"percentile_inc_nulls\": 0.3493878245353699, \"value_count\": 165, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 330.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3476606607437134, \"percentile_inc_nulls\": 0.34796369075775146, \"value_count\": 164, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 328.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3455886244773865, \"percentile_inc_nulls\": 0.34589266777038574, \"value_count\": 159, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 477.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.344215989112854, \"percentile_inc_nulls\": 0.3445206880569458, \"value_count\": 158, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 316.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 
0.3421700596809387, \"percentile_inc_nulls\": 0.3424757122993469, \"value_count\": 157, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 471.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3414924144744873, \"percentile_inc_nulls\": 0.3417983651161194, \"value_count\": 156, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 156.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.34014588594436646, \"percentile_inc_nulls\": 0.3404524326324463, \"value_count\": 155, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 310.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3394855856895447, \"percentile_inc_nulls\": 0.33979249000549316, \"value_count\": 152, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 152.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3388296961784363, \"percentile_inc_nulls\": 0.33913683891296387, \"value_count\": 151, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 151.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3381824493408203, \"percentile_inc_nulls\": 0.33848994970321655, \"value_count\": 149, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 149.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.33689671754837036, \"percentile_inc_nulls\": 0.3372047543525696, \"value_count\": 148, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 296.0, \"distinct_value_count\": 5233}, 
{\"percentile_ex_nulls\": 0.3362581729888916, \"percentile_inc_nulls\": 0.3365665078163147, \"value_count\": 147, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 147.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.33562833070755005, \"percentile_inc_nulls\": 0.335936963558197, \"value_count\": 145, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 145.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3350027799606323, \"percentile_inc_nulls\": 0.3353117108345032, \"value_count\": 144, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.33189696073532104, \"percentile_inc_nulls\": 0.3322073817253113, \"value_count\": 143, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 715.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3312801718711853, \"percentile_inc_nulls\": 0.33159083127975464, \"value_count\": 142, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 142.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.33005523681640625, \"percentile_inc_nulls\": 0.33036643266677856, \"value_count\": 141, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 282.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3282307982444763, \"percentile_inc_nulls\": 0.32854288816452026, \"value_count\": 140, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 420.0, \"distinct_value_count\": 
5233}, {\"percentile_ex_nulls\": 0.3270232081413269, \"percentile_inc_nulls\": 0.3273358941078186, \"value_count\": 139, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 278.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3264237642288208, \"percentile_inc_nulls\": 0.3267366886138916, \"value_count\": 138, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3258286714553833, \"percentile_inc_nulls\": 0.326141893863678, \"value_count\": 137, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 137.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3246471881866455, \"percentile_inc_nulls\": 0.32496094703674316, \"value_count\": 136, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 272.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.32288795709609985, \"percentile_inc_nulls\": 0.3232024908065796, \"value_count\": 135, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 405.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3217238187789917, \"percentile_inc_nulls\": 0.3220388889312744, \"value_count\": 134, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 268.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.32114607095718384, \"percentile_inc_nulls\": 0.32146143913269043, \"value_count\": 133, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 133.0, 
\"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3205726742744446, \"percentile_inc_nulls\": 0.32088834047317505, \"value_count\": 132, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 132.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3188655972480774, \"percentile_inc_nulls\": 0.31918197870254517, \"value_count\": 131, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 393.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.31717151403427124, \"percentile_inc_nulls\": 0.31748872995376587, \"value_count\": 130, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 390.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.31605076789855957, \"percentile_inc_nulls\": 0.3163685202598572, \"value_count\": 129, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 258.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.31550347805023193, \"percentile_inc_nulls\": 0.31582146883010864, \"value_count\": 126, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 126.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3138875961303711, \"percentile_inc_nulls\": 0.3142063021659851, \"value_count\": 124, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 372.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3107079267501831, \"percentile_inc_nulls\": 0.3110281229019165, \"value_count\": 122, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, 
\"sum_tokens_in_value_count_group\": 732.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3101823329925537, \"percentile_inc_nulls\": 0.3105027675628662, \"value_count\": 121, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 121.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.30966103076934814, \"percentile_inc_nulls\": 0.3099817633628845, \"value_count\": 120, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3081102967262268, \"percentile_inc_nulls\": 0.3084317445755005, \"value_count\": 119, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 357.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.30708515644073486, \"percentile_inc_nulls\": 0.30740708112716675, \"value_count\": 118, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 236.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3065769672393799, \"percentile_inc_nulls\": 0.3068990707397461, \"value_count\": 117, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 117.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3050653338432312, \"percentile_inc_nulls\": 0.3053881525993347, \"value_count\": 116, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 348.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.30456578731536865, \"percentile_inc_nulls\": 0.30488884449005127, \"value_count\": 115, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 
230320, \"sum_tokens_in_value_count_group\": 115.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.30309319496154785, \"percentile_inc_nulls\": 0.3034169673919678, \"value_count\": 113, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 339.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.30163371562957764, \"percentile_inc_nulls\": 0.3019581437110901, \"value_count\": 112, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 336.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.3011515140533447, \"percentile_inc_nulls\": 0.3014761805534363, \"value_count\": 111, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 111.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.29971808195114136, \"percentile_inc_nulls\": 0.30004340410232544, \"value_count\": 110, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 330.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.298297643661499, \"percentile_inc_nulls\": 0.2986236810684204, \"value_count\": 109, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 327.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.29454463720321655, \"percentile_inc_nulls\": 0.29487234354019165, \"value_count\": 108, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 864.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.29361504316329956, \"percentile_inc_nulls\": 0.29394322633743286, \"value_count\": 107, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, 
\"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 214.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.29177325963974, \"percentile_inc_nulls\": 0.29210227727890015, \"value_count\": 106, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 424.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.28949278593063354, \"percentile_inc_nulls\": 0.28982287645339966, \"value_count\": 105, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 525.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.28904104232788086, \"percentile_inc_nulls\": 0.2893713116645813, \"value_count\": 104, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 104.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2881461977958679, \"percentile_inc_nulls\": 0.2884768843650818, \"value_count\": 103, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 206.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2854877710342407, \"percentile_inc_nulls\": 0.2858197093009949, \"value_count\": 102, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 612.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2841716408729553, \"percentile_inc_nulls\": 0.2845041751861572, \"value_count\": 101, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 303.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.282881498336792, \"percentile_inc_nulls\": 0.28321462869644165, \"value_count\": 99, \"group_name\": \"_city_\", \"total_non_null_rows\": 
230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 297.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2816044092178345, \"percentile_inc_nulls\": 0.28193819522857666, \"value_count\": 98, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 294.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.27907633781433105, \"percentile_inc_nulls\": 0.279411256313324, \"value_count\": 97, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 582.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2769913077354431, \"percentile_inc_nulls\": 0.27732717990875244, \"value_count\": 96, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 480.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2749279737472534, \"percentile_inc_nulls\": 0.2752648591995239, \"value_count\": 95, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 475.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2724781036376953, \"percentile_inc_nulls\": 0.2728160619735718, \"value_count\": 94, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 564.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2712661623954773, \"percentile_inc_nulls\": 0.2716047167778015, \"value_count\": 93, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 279.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2708665728569031, \"percentile_inc_nulls\": 0.2712053060531616, \"value_count\": 92, \"group_name\": \"_city_\", \"total_non_null_rows\": 
230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 92.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2684948444366455, \"percentile_inc_nulls\": 0.26883465051651, \"value_count\": 91, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 546.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.26732200384140015, \"percentile_inc_nulls\": 0.2676624059677124, \"value_count\": 90, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 270.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2661622166633606, \"percentile_inc_nulls\": 0.26650315523147583, \"value_count\": 89, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 267.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2646331787109375, \"percentile_inc_nulls\": 0.26497483253479004, \"value_count\": 88, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 352.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.26274365186691284, \"percentile_inc_nulls\": 0.26308614015579224, \"value_count\": 87, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 435.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.26162290573120117, \"percentile_inc_nulls\": 0.2619659900665283, \"value_count\": 86, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 258.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.25903838872909546, \"percentile_inc_nulls\": 0.25938260555267334, \"value_count\": 85, \"group_name\": \"_city_\", \"total_non_null_rows\": 
230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 595.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.25684911012649536, \"percentile_inc_nulls\": 0.2571943402290344, \"value_count\": 84, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 504.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2554069757461548, \"percentile_inc_nulls\": 0.2557528614997864, \"value_count\": 83, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 332.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2532697916030884, \"percentile_inc_nulls\": 0.25361669063568115, \"value_count\": 82, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 492.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.252214252948761, \"percentile_inc_nulls\": 0.25256162881851196, \"value_count\": 81, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 243.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.25151926279067993, \"percentile_inc_nulls\": 0.25186699628829956, \"value_count\": 80, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.24946027994155884, \"percentile_inc_nulls\": 0.24980896711349487, \"value_count\": 79, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 474.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.24607211351394653, \"percentile_inc_nulls\": 0.24642235040664673, \"value_count\": 78, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 780.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2440652847290039, \"percentile_inc_nulls\": 0.2444164752960205, \"value_count\": 77, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 462.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.24109411239624023, \"percentile_inc_nulls\": 0.24144667387008667, \"value_count\": 76, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 684.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2388136386871338, \"percentile_inc_nulls\": 0.23916727304458618, \"value_count\": 75, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 525.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.23784929513931274, \"percentile_inc_nulls\": 0.23820334672927856, \"value_count\": 74, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 222.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2368980050086975, \"percentile_inc_nulls\": 0.23725253343582153, \"value_count\": 73, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 219.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2347087264060974, \"percentile_inc_nulls\": 0.23506426811218262, \"value_count\": 72, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 504.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.23285824060440063, \"percentile_inc_nulls\": 0.23321467638015747, \"value_count\": 71, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 426.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.23133796453475952, \"percentile_inc_nulls\": 0.2316950559616089, \"value_count\": 70, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 350.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.23013901710510254, \"percentile_inc_nulls\": 0.23049670457839966, \"value_count\": 69, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 276.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.22777599096298218, \"percentile_inc_nulls\": 0.22813475131988525, \"value_count\": 68, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 544.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.22661185264587402, \"percentile_inc_nulls\": 0.22697114944458008, \"value_count\": 67, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 268.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2246050238609314, \"percentile_inc_nulls\": 0.22496527433395386, \"value_count\": 66, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 462.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2212168574333191, \"percentile_inc_nulls\": 0.2215786576271057, \"value_count\": 65, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 780.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.21760284900665283, \"percentile_inc_nulls\": 0.21796631813049316, \"value_count\": 64, 
\"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 832.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.21486622095108032, \"percentile_inc_nulls\": 0.2152310013771057, \"value_count\": 63, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 630.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.21217310428619385, \"percentile_inc_nulls\": 0.21253907680511475, \"value_count\": 62, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 620.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.20925837755203247, \"percentile_inc_nulls\": 0.2096257209777832, \"value_count\": 61, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 671.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2082158923149109, \"percentile_inc_nulls\": 0.20858371257781982, \"value_count\": 60, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 240.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.206165611743927, \"percentile_inc_nulls\": 0.20653438568115234, \"value_count\": 59, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 472.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.20415008068084717, \"percentile_inc_nulls\": 0.20451980829238892, \"value_count\": 58, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 464.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.2011789083480835, \"percentile_inc_nulls\": 0.20155000686645508, \"value_count\": 
57, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 684.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.19874638319015503, \"percentile_inc_nulls\": 0.19911861419677734, \"value_count\": 56, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 560.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.19611835479736328, \"percentile_inc_nulls\": 0.19649183750152588, \"value_count\": 55, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 605.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.1944764256477356, \"percentile_inc_nulls\": 0.1948506236076355, \"value_count\": 54, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 378.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.19171375036239624, \"percentile_inc_nulls\": 0.1920892596244812, \"value_count\": 53, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 636.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.19035851955413818, \"percentile_inc_nulls\": 0.1907346248626709, \"value_count\": 52, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 312.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.18858623504638672, \"percentile_inc_nulls\": 0.1889631748199463, \"value_count\": 51, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 408.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.185979962348938, \"percentile_inc_nulls\": 0.18635809421539307, 
\"value_count\": 50, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 600.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.1830000877380371, \"percentile_inc_nulls\": 0.1833796501159668, \"value_count\": 49, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 686.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.18028956651687622, \"percentile_inc_nulls\": 0.1806703805923462, \"value_count\": 48, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 624.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.17763549089431763, \"percentile_inc_nulls\": 0.17801755666732788, \"value_count\": 47, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 611.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.1750379204750061, \"percentile_inc_nulls\": 0.17542117834091187, \"value_count\": 46, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 598.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.17151939868927002, \"percentile_inc_nulls\": 0.1719043254852295, \"value_count\": 45, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 810.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.16960811614990234, \"percentile_inc_nulls\": 0.16999393701553345, \"value_count\": 44, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 440.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.167927086353302, \"percentile_inc_nulls\": 
0.1683136224746704, \"value_count\": 43, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 387.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.16555535793304443, \"percentile_inc_nulls\": 0.16594302654266357, \"value_count\": 42, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 546.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.16324013471603394, \"percentile_inc_nulls\": 0.16362887620925903, \"value_count\": 41, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 533.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.16046011447906494, \"percentile_inc_nulls\": 0.16085010766983032, \"value_count\": 40, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 640.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.15808838605880737, \"percentile_inc_nulls\": 0.1584795117378235, \"value_count\": 39, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 546.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.1557774543762207, \"percentile_inc_nulls\": 0.15616965293884277, \"value_count\": 38, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 532.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.15256303548812866, \"percentile_inc_nulls\": 0.1529567837715149, \"value_count\": 37, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 740.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.15131205320358276, 
\"percentile_inc_nulls\": 0.15170633792877197, \"value_count\": 36, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 288.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.148271381855011, \"percentile_inc_nulls\": 0.1486670970916748, \"value_count\": 35, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 700.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.14546531438827515, \"percentile_inc_nulls\": 0.14586228132247925, \"value_count\": 34, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 646.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.14202499389648438, \"percentile_inc_nulls\": 0.1424235701560974, \"value_count\": 33, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 792.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.1379939317703247, \"percentile_inc_nulls\": 0.13839441537857056, \"value_count\": 32, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 928.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.13462752103805542, \"percentile_inc_nulls\": 0.13502949476242065, \"value_count\": 31, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 775.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.12980586290359497, \"percentile_inc_nulls\": 0.13021016120910645, \"value_count\": 30, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1110.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 
0.1256488561630249, \"percentile_inc_nulls\": 0.1260550618171692, \"value_count\": 29, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 957.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.1206621527671814, \"percentile_inc_nulls\": 0.12107068300247192, \"value_count\": 28, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1148.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.11585360765457153, \"percentile_inc_nulls\": 0.11626434326171875, \"value_count\": 27, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1107.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.11280423402786255, \"percentile_inc_nulls\": 0.11321640014648438, \"value_count\": 26, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 702.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.10932916402816772, \"percentile_inc_nulls\": 0.10974293947219849, \"value_count\": 25, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 800.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.10568040609359741, \"percentile_inc_nulls\": 0.10609585046768188, \"value_count\": 24, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 840.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.09978586435317993, \"percentile_inc_nulls\": 0.10020405054092407, \"value_count\": 23, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1357.0, \"distinct_value_count\": 5233}, 
{\"percentile_ex_nulls\": 0.09586775302886963, \"percentile_inc_nulls\": 0.0962877869606018, \"value_count\": 22, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 902.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.09130674600601196, \"percentile_inc_nulls\": 0.09172892570495605, \"value_count\": 21, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1050.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.08800548315048218, \"percentile_inc_nulls\": 0.08842915296554565, \"value_count\": 20, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 760.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.08363121747970581, \"percentile_inc_nulls\": 0.08405697345733643, \"value_count\": 19, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1007.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.07854896783828735, \"percentile_inc_nulls\": 0.07897704839706421, \"value_count\": 18, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1170.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.07478290796279907, \"percentile_inc_nulls\": 0.07521277666091919, \"value_count\": 17, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 867.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.07096034288406372, \"percentile_inc_nulls\": 0.0713919997215271, \"value_count\": 16, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 880.0, 
\"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.06633424758911133, \"percentile_inc_nulls\": 0.06676799058914185, \"value_count\": 15, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1065.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.06256377696990967, \"percentile_inc_nulls\": 0.06299930810928345, \"value_count\": 14, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 868.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.05793333053588867, \"percentile_inc_nulls\": 0.05837094783782959, \"value_count\": 13, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1066.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.05444091558456421, \"percentile_inc_nulls\": 0.05488014221191406, \"value_count\": 12, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 804.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.050666093826293945, \"percentile_inc_nulls\": 0.051107168197631836, \"value_count\": 11, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 869.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.045974791049957275, \"percentile_inc_nulls\": 0.04641801118850708, \"value_count\": 10, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1080.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.04089254140853882, \"percentile_inc_nulls\": 0.04133814573287964, \"value_count\": 9, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, 
\"sum_tokens_in_value_count_group\": 1170.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.03585374355316162, \"percentile_inc_nulls\": 0.03630167245864868, \"value_count\": 8, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1160.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.0304718017578125, \"percentile_inc_nulls\": 0.03092217445373535, \"value_count\": 7, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1239.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.025806546211242676, \"percentile_inc_nulls\": 0.026259124279022217, \"value_count\": 6, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1074.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.0207025408744812, \"percentile_inc_nulls\": 0.021157503128051758, \"value_count\": 5, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1175.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.014812350273132324, \"percentile_inc_nulls\": 0.015270054340362549, \"value_count\": 4, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1356.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.009208858013153076, \"percentile_inc_nulls\": 0.009669184684753418, \"value_count\": 3, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1290.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.003831207752227783, \"percentile_inc_nulls\": 0.004294037818908691, \"value_count\": 2, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, 
\"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1238.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.00046455860137939453, \"value_count\": 1, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 882.0, \"distinct_value_count\": 5233}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 19668, \"group_name\": \"_city_\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 19668.0, \"distinct_value_count\": 5233}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"city\\\"\", \"subtitle\": \"In this col, 107 values (0.0%) are null and there are 5233 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 19668, \"group_name\": \"_city_\", \"value\": \"new york\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 5233}, {\"value_count\": 6526, \"group_name\": \"_city_\", \"value\": \"houston\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 5233}, {\"value_count\": 4102, \"group_name\": \"_city_\", \"value\": \"dallas\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 5233}, {\"value_count\": 
3336, \"group_name\": \"_city_\", \"value\": \"chicago\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 5233}, {\"value_count\": 3028, \"group_name\": \"_city_\", \"value\": \"boston\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 5233}, {\"value_count\": 2590, \"group_name\": \"_city_\", \"value\": \"san diego\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 5233}, {\"value_count\": 2576, \"group_name\": \"_city_\", \"value\": \"charlotte\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 5233}, {\"value_count\": 2443, \"group_name\": \"_city_\", \"value\": \"atlanta\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 5233}, {\"value_count\": 2395, \"group_name\": \"_city_\", \"value\": \"las vegas\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 5233}, {\"value_count\": 2341, \"group_name\": \"_city_\", \"value\": \"san francisco\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 5233}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"del. 
miguel hidalgo\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 5233}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"restonn\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 5233}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"france\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 5233}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"alachva\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 5233}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"la plata,\", \"total_non_null_rows\": 230213, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 5233}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 19668]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9960616827011108, \"percentile_inc_nulls\": 0.9960663318634033, \"value_count\": 906, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 906.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9925928711891174, \"percentile_inc_nulls\": 0.9926015734672546, \"value_count\": 798, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, 
\"sum_tokens_in_value_count_group\": 798.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9893587231636047, \"percentile_inc_nulls\": 0.9893712997436523, \"value_count\": 744, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 744.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9866940975189209, \"percentile_inc_nulls\": 0.9867097735404968, \"value_count\": 613, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 613.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.984146773815155, \"percentile_inc_nulls\": 0.9841654896736145, \"value_count\": 586, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 586.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9820907115936279, \"percentile_inc_nulls\": 0.9821118712425232, \"value_count\": 473, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 473.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9800606966018677, \"percentile_inc_nulls\": 0.980084240436554, \"value_count\": 467, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 467.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9781697988510132, \"percentile_inc_nulls\": 0.9781955480575562, \"value_count\": 435, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 435.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9763049483299255, \"percentile_inc_nulls\": 0.9763329029083252, \"value_count\": 429, \"group_name\": 
\"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 429.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9745357632637024, \"percentile_inc_nulls\": 0.9745658040046692, \"value_count\": 407, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 407.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.97307950258255, \"percentile_inc_nulls\": 0.9731113314628601, \"value_count\": 335, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 335.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.971675455570221, \"percentile_inc_nulls\": 0.9717089533805847, \"value_count\": 323, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 323.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9704322814941406, \"percentile_inc_nulls\": 0.9704671502113342, \"value_count\": 286, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 286.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9692150950431824, \"percentile_inc_nulls\": 0.9692514538764954, \"value_count\": 280, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 280.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9680327773094177, \"percentile_inc_nulls\": 0.9680705070495605, \"value_count\": 272, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 272.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9668765068054199, 
\"percentile_inc_nulls\": 0.9669156074523926, \"value_count\": 266, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 266.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9657419323921204, \"percentile_inc_nulls\": 0.9657824039459229, \"value_count\": 261, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 261.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9646291136741638, \"percentile_inc_nulls\": 0.9646708965301514, \"value_count\": 256, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 256.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9635206460952759, \"percentile_inc_nulls\": 0.9635637402534485, \"value_count\": 255, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 255.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9624295830726624, \"percentile_inc_nulls\": 0.9624739289283752, \"value_count\": 251, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 251.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9614384770393372, \"percentile_inc_nulls\": 0.9614840149879456, \"value_count\": 228, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 228.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9604865312576294, \"percentile_inc_nulls\": 0.9605331420898438, \"value_count\": 219, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 219.0, 
\"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9595432281494141, \"percentile_inc_nulls\": 0.9595910310745239, \"value_count\": 217, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 217.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9586129784584045, \"percentile_inc_nulls\": 0.9586618542671204, \"value_count\": 214, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 214.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.957704484462738, \"percentile_inc_nulls\": 0.9577544331550598, \"value_count\": 209, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 209.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9568394422531128, \"percentile_inc_nulls\": 0.9568904042243958, \"value_count\": 199, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 199.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9559961557388306, \"percentile_inc_nulls\": 0.9560481309890747, \"value_count\": 194, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 194.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9551658630371094, \"percentile_inc_nulls\": 0.9552188515663147, \"value_count\": 191, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 191.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9543443322181702, \"percentile_inc_nulls\": 0.9543982148170471, \"value_count\": 189, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, 
\"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 189.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9535270929336548, \"percentile_inc_nulls\": 0.9535819888114929, \"value_count\": 188, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 188.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9527316093444824, \"percentile_inc_nulls\": 0.9527873992919922, \"value_count\": 183, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 183.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9511493444442749, \"percentile_inc_nulls\": 0.9512070417404175, \"value_count\": 182, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 364.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9504103660583496, \"percentile_inc_nulls\": 0.950468897819519, \"value_count\": 170, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 170.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9489932656288147, \"percentile_inc_nulls\": 0.9490534663200378, \"value_count\": 163, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 326.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9482933878898621, \"percentile_inc_nulls\": 0.9483544826507568, \"value_count\": 161, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 161.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9469023942947388, \"percentile_inc_nulls\": 0.9469650983810425, 
\"value_count\": 160, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 320.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9462199211120605, \"percentile_inc_nulls\": 0.9462834596633911, \"value_count\": 157, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 157.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9455461502075195, \"percentile_inc_nulls\": 0.9456104636192322, \"value_count\": 155, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 155.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9449158310890198, \"percentile_inc_nulls\": 0.9449809193611145, \"value_count\": 145, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 145.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9443159699440002, \"percentile_inc_nulls\": 0.9443817138671875, \"value_count\": 138, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9437291622161865, \"percentile_inc_nulls\": 0.9437955617904663, \"value_count\": 135, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 135.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9431466460227966, \"percentile_inc_nulls\": 0.9432138204574585, \"value_count\": 134, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 134.0, \"distinct_value_count\": 40089}, 
{\"percentile_ex_nulls\": 0.9425771832466125, \"percentile_inc_nulls\": 0.9426450133323669, \"value_count\": 131, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 131.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9420294761657715, \"percentile_inc_nulls\": 0.9420979619026184, \"value_count\": 126, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 126.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.941486120223999, \"percentile_inc_nulls\": 0.9415552020072937, \"value_count\": 125, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 125.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9409471154212952, \"percentile_inc_nulls\": 0.9410168528556824, \"value_count\": 124, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 124.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9404211044311523, \"percentile_inc_nulls\": 0.9404914975166321, \"value_count\": 121, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 121.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9399082064628601, \"percentile_inc_nulls\": 0.9399791359901428, \"value_count\": 118, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 118.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9389083981513977, \"percentile_inc_nulls\": 0.9389805793762207, \"value_count\": 115, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, 
\"sum_tokens_in_value_count_group\": 230.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9384128451347351, \"percentile_inc_nulls\": 0.9384855628013611, \"value_count\": 114, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9379216432571411, \"percentile_inc_nulls\": 0.9379949569702148, \"value_count\": 113, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 113.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9374391436576843, \"percentile_inc_nulls\": 0.937512993812561, \"value_count\": 111, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 111.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9369609951972961, \"percentile_inc_nulls\": 0.9370354413986206, \"value_count\": 110, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9364871978759766, \"percentile_inc_nulls\": 0.936562180519104, \"value_count\": 109, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 109.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9360220432281494, \"percentile_inc_nulls\": 0.9360976219177246, \"value_count\": 107, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 107.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9355655908584595, \"percentile_inc_nulls\": 0.9356417059898376, \"value_count\": 105, \"group_name\": 
\"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 105.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9346614480018616, \"percentile_inc_nulls\": 0.9347386360168457, \"value_count\": 104, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 208.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9342137575149536, \"percentile_inc_nulls\": 0.934291422367096, \"value_count\": 103, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 103.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9337703585624695, \"percentile_inc_nulls\": 0.9338485598564148, \"value_count\": 102, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.933331310749054, \"percentile_inc_nulls\": 0.9334100484848022, \"value_count\": 101, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 101.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9329009652137756, \"percentile_inc_nulls\": 0.9329801797866821, \"value_count\": 99, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9324749708175659, \"percentile_inc_nulls\": 0.9325547218322754, \"value_count\": 98, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9320533275604248, 
\"percentile_inc_nulls\": 0.9321335554122925, \"value_count\": 97, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 97.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.931218683719635, \"percentile_inc_nulls\": 0.9312999248504639, \"value_count\": 96, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 192.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9303928017616272, \"percentile_inc_nulls\": 0.9304749965667725, \"value_count\": 95, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 190.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9295755624771118, \"percentile_inc_nulls\": 0.9296587109565735, \"value_count\": 94, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 188.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9287670254707336, \"percentile_inc_nulls\": 0.9288511872291565, \"value_count\": 93, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 186.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9271673560142517, \"percentile_inc_nulls\": 0.9272533655166626, \"value_count\": 92, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 368.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9267718195915222, \"percentile_inc_nulls\": 0.9268583059310913, \"value_count\": 91, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 91.0, 
\"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9263805747032166, \"percentile_inc_nulls\": 0.9264675378799438, \"value_count\": 90, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9259936809539795, \"percentile_inc_nulls\": 0.926081120967865, \"value_count\": 89, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 89.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9256155490875244, \"percentile_inc_nulls\": 0.9257033467292786, \"value_count\": 87, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 87.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9244940280914307, \"percentile_inc_nulls\": 0.9245831966400146, \"value_count\": 86, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 258.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.924124538898468, \"percentile_inc_nulls\": 0.9242141246795654, \"value_count\": 85, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 85.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9230291247367859, \"percentile_inc_nulls\": 0.9231200218200684, \"value_count\": 84, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 252.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9226683378219604, \"percentile_inc_nulls\": 0.9227596521377563, \"value_count\": 83, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, 
\"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 83.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9209295511245728, \"percentile_inc_nulls\": 0.921022891998291, \"value_count\": 80, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 400.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.920586109161377, \"percentile_inc_nulls\": 0.9206799268722534, \"value_count\": 79, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 79.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9192299246788025, \"percentile_inc_nulls\": 0.9193252921104431, \"value_count\": 78, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 312.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9185605049133301, \"percentile_inc_nulls\": 0.9186566472053528, \"value_count\": 77, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 154.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9172390103340149, \"percentile_inc_nulls\": 0.9173367619514465, \"value_count\": 76, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 304.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9169129729270935, \"percentile_inc_nulls\": 0.9170111417770386, \"value_count\": 75, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 75.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9162870645523071, \"percentile_inc_nulls\": 0.9163858890533447, \"value_count\": 72, 
\"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9150525331497192, \"percentile_inc_nulls\": 0.9151528477668762, \"value_count\": 71, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 284.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9147482514381409, \"percentile_inc_nulls\": 0.9148489236831665, \"value_count\": 70, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9141483306884766, \"percentile_inc_nulls\": 0.9142497181892395, \"value_count\": 69, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9135571718215942, \"percentile_inc_nulls\": 0.9136592745780945, \"value_count\": 68, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 136.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9132659435272217, \"percentile_inc_nulls\": 0.9133683443069458, \"value_count\": 67, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 67.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9129833579063416, \"percentile_inc_nulls\": 0.9130861163139343, \"value_count\": 65, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 65.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 
0.9118705987930298, \"percentile_inc_nulls\": 0.9119746685028076, \"value_count\": 64, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 256.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.911322832107544, \"percentile_inc_nulls\": 0.9114275574684143, \"value_count\": 63, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 126.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9107925295829773, \"percentile_inc_nulls\": 0.9108978509902954, \"value_count\": 61, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 122.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9100100994110107, \"percentile_inc_nulls\": 0.9101163744926453, \"value_count\": 60, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 180.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9092406630516052, \"percentile_inc_nulls\": 0.9093478918075562, \"value_count\": 59, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 177.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9089885354042053, \"percentile_inc_nulls\": 0.9090960621833801, \"value_count\": 58, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9079974889755249, \"percentile_inc_nulls\": 0.9081060886383057, \"value_count\": 57, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, 
\"sum_tokens_in_value_count_group\": 228.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.907267153263092, \"percentile_inc_nulls\": 0.9073767066001892, \"value_count\": 56, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 168.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9060717821121216, \"percentile_inc_nulls\": 0.9061827063560486, \"value_count\": 55, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 275.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9046633839607239, \"percentile_inc_nulls\": 0.9047759771347046, \"value_count\": 54, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 324.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9035331606864929, \"percentile_inc_nulls\": 0.9036471247673035, \"value_count\": 52, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 260.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9022030234336853, \"percentile_inc_nulls\": 0.90231853723526, \"value_count\": 51, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 306.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9013336300849915, \"percentile_inc_nulls\": 0.9014501571655273, \"value_count\": 50, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.9000556468963623, \"percentile_inc_nulls\": 0.9001736640930176, \"value_count\": 49, \"group_name\": 
\"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 294.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8992210626602173, \"percentile_inc_nulls\": 0.899340033531189, \"value_count\": 48, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 192.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8981994986534119, \"percentile_inc_nulls\": 0.8983197212219238, \"value_count\": 47, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 235.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8965998291969299, \"percentile_inc_nulls\": 0.8967219591140747, \"value_count\": 46, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 368.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8942524790763855, \"percentile_inc_nulls\": 0.8943774104118347, \"value_count\": 45, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 540.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8934874534606934, \"percentile_inc_nulls\": 0.8936132192611694, \"value_count\": 44, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 176.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8916182518005371, \"percentile_inc_nulls\": 0.8917462825775146, \"value_count\": 43, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 430.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.890340268611908, 
\"percentile_inc_nulls\": 0.8904697895050049, \"value_count\": 42, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 294.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8883798122406006, \"percentile_inc_nulls\": 0.8885116577148438, \"value_count\": 41, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 451.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8862932920455933, \"percentile_inc_nulls\": 0.8864275813102722, \"value_count\": 40, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 480.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8842589259147644, \"percentile_inc_nulls\": 0.8843955993652344, \"value_count\": 39, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 468.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.882607102394104, \"percentile_inc_nulls\": 0.8827457427978516, \"value_count\": 38, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 380.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8798729181289673, \"percentile_inc_nulls\": 0.8800147771835327, \"value_count\": 37, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 629.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8773690462112427, \"percentile_inc_nulls\": 0.8775138854980469, \"value_count\": 36, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 576.0, 
\"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8755433559417725, \"percentile_inc_nulls\": 0.8756903409957886, \"value_count\": 35, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 420.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8730308413505554, \"percentile_inc_nulls\": 0.8731808066368103, \"value_count\": 34, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 578.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8700184226036072, \"percentile_inc_nulls\": 0.8701719045639038, \"value_count\": 33, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 693.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8672363758087158, \"percentile_inc_nulls\": 0.8673931956291199, \"value_count\": 32, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 640.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8641370534896851, \"percentile_inc_nulls\": 0.8642975091934204, \"value_count\": 31, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 713.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8542260527610779, \"percentile_inc_nulls\": 0.854398250579834, \"value_count\": 30, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2280.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8430066704750061, \"percentile_inc_nulls\": 0.8431921005249023, \"value_count\": 29, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, 
\"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2581.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8309569954872131, \"percentile_inc_nulls\": 0.8311566710472107, \"value_count\": 28, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2772.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8173424601554871, \"percentile_inc_nulls\": 0.8175581693649292, \"value_count\": 27, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3132.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.8058144450187683, \"percentile_inc_nulls\": 0.8060437440872192, \"value_count\": 26, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2652.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.7959251999855042, \"percentile_inc_nulls\": 0.796166181564331, \"value_count\": 25, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2275.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.7849709987640381, \"percentile_inc_nulls\": 0.7852249145507812, \"value_count\": 24, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2520.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.7717736959457397, \"percentile_inc_nulls\": 0.7720432281494141, \"value_count\": 23, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3036.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.7584808468818665, \"percentile_inc_nulls\": 0.7587660551071167, 
\"value_count\": 22, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3058.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.7432361841201782, \"percentile_inc_nulls\": 0.7435394525527954, \"value_count\": 21, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3507.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.7268048524856567, \"percentile_inc_nulls\": 0.7271274328231812, \"value_count\": 20, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3780.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.7090476751327515, \"percentile_inc_nulls\": 0.7093912959098816, \"value_count\": 19, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 4085.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.6908167004585266, \"percentile_inc_nulls\": 0.6911818385124207, \"value_count\": 18, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 4194.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.670790433883667, \"percentile_inc_nulls\": 0.6711792349815369, \"value_count\": 17, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 4607.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.6495774984359741, \"percentile_inc_nulls\": 0.6499912738800049, \"value_count\": 16, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 4880.0, \"distinct_value_count\": 40089}, 
{\"percentile_ex_nulls\": 0.6246044635772705, \"percentile_inc_nulls\": 0.6250478029251099, \"value_count\": 15, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 5745.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.5996531248092651, \"percentile_inc_nulls\": 0.6001259088516235, \"value_count\": 14, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 5740.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.5716241598129272, \"percentile_inc_nulls\": 0.5721300840377808, \"value_count\": 13, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 6448.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.5411088466644287, \"percentile_inc_nulls\": 0.5416507720947266, \"value_count\": 12, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 7020.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.5080678462982178, \"percentile_inc_nulls\": 0.5086488723754883, \"value_count\": 11, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 7601.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.4703800678253174, \"percentile_inc_nulls\": 0.47100555896759033, \"value_count\": 10, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 8670.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.42902785539627075, \"percentile_inc_nulls\": 0.42970216274261475, \"value_count\": 9, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 
230320, \"sum_tokens_in_value_count_group\": 9513.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.38677579164505005, \"percentile_inc_nulls\": 0.38749998807907104, \"value_count\": 8, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 9720.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.3392161726951599, \"percentile_inc_nulls\": 0.3399965167045593, \"value_count\": 7, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 10941.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.2879660129547119, \"percentile_inc_nulls\": 0.28880685567855835, \"value_count\": 6, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 11790.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.23104310035705566, \"percentile_inc_nulls\": 0.23195117712020874, \"value_count\": 5, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 13095.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.16950809955596924, \"percentile_inc_nulls\": 0.1704888939857483, \"value_count\": 4, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 14156.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.10641694068908691, \"percentile_inc_nulls\": 0.1074722409248352, \"value_count\": 3, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 14514.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.04255199432373047, \"percentile_inc_nulls\": 0.04368269443511963, \"value_count\": 2, 
\"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 14692.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0011809468269348145, \"value_count\": 1, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 9789.0, \"distinct_value_count\": 40089}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 906, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 906.0, \"distinct_value_count\": 40089}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"street_address\\\"\", \"subtitle\": \"In this col, 272 values (0.1%) are null and there are 40089 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 906, \"group_name\": \"_street_address_\", \"value\": \"1585 broadway\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 40089}, {\"value_count\": 798, \"group_name\": \"_street_address_\", \"value\": \"4500 park granada\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 40089}, {\"value_count\": 744, \"group_name\": \"_street_address_\", 
\"value\": \"301 south college street\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 40089}, {\"value_count\": 613, \"group_name\": \"_street_address_\", \"value\": \"388 greenwich st\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 40089}, {\"value_count\": 586, \"group_name\": \"_street_address_\", \"value\": \"711 high street\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 40089}, {\"value_count\": 473, \"group_name\": \"_street_address_\", \"value\": \"world financial center\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 40089}, {\"value_count\": 467, \"group_name\": \"_street_address_\", \"value\": \"c/o state street bank & trust co\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 40089}, {\"value_count\": 435, \"group_name\": \"_street_address_\", \"value\": \"lehman abs corp\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 40089}, {\"value_count\": 429, \"group_name\": \"_street_address_\", \"value\": \"one international place\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 40089}, {\"value_count\": 407, \"group_name\": \"_street_address_\", \"value\": \"383 madison avenue\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 40089}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": 
\"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"8943 fullbright ave\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 40089}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"400 e vine st ste 300\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 40089}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"8000 maryland ave ste 1190\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 40089}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"one international place ste 520\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 40089}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"6855 south red road ste 400\", \"total_non_null_rows\": 230048, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 40089}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 906]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.99008709192276, \"percentile_inc_nulls\": 0.9901137351989746, \"value_count\": 2277, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2277.0, 
\"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.9816457033157349, \"percentile_inc_nulls\": 0.9816950559616089, \"value_count\": 1939, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1939.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.973717987537384, \"percentile_inc_nulls\": 0.9737886190414429, \"value_count\": 1821, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1821.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.9667393565177917, \"percentile_inc_nulls\": 0.9668287634849548, \"value_count\": 1603, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1603.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.9604964852333069, \"percentile_inc_nulls\": 0.9606026411056519, \"value_count\": 1434, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1434.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.9545148015022278, \"percentile_inc_nulls\": 0.9546370506286621, \"value_count\": 1374, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1374.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.9485548734664917, \"percentile_inc_nulls\": 0.9486930966377258, \"value_count\": 1369, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1369.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.9427255392074585, \"percentile_inc_nulls\": 0.9428794980049133, \"value_count\": 1339, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 
230320, \"sum_tokens_in_value_count_group\": 1339.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.937832236289978, \"percentile_inc_nulls\": 0.9379993081092834, \"value_count\": 1124, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1124.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.933365523815155, \"percentile_inc_nulls\": 0.9335446357727051, \"value_count\": 1026, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1026.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.9293820858001709, \"percentile_inc_nulls\": 0.9295718669891357, \"value_count\": 915, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 915.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.9254465699195862, \"percentile_inc_nulls\": 0.9256469011306763, \"value_count\": 904, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 904.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.9215763211250305, \"percentile_inc_nulls\": 0.9217870831489563, \"value_count\": 889, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 889.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.9181152582168579, \"percentile_inc_nulls\": 0.9183353781700134, \"value_count\": 795, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 795.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.9117678999900818, \"percentile_inc_nulls\": 0.9120050668716431, \"value_count\": 729, \"group_name\": \"_zip_code_\", 
\"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1458.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.908716082572937, \"percentile_inc_nulls\": 0.9089614152908325, \"value_count\": 701, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 701.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.9057644605636597, \"percentile_inc_nulls\": 0.9060177206993103, \"value_count\": 678, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 678.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.9029172658920288, \"percentile_inc_nulls\": 0.9031782150268555, \"value_count\": 654, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 654.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.9000918865203857, \"percentile_inc_nulls\": 0.9003603458404541, \"value_count\": 649, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 649.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8973099589347839, \"percentile_inc_nulls\": 0.8975859880447388, \"value_count\": 639, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 639.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8918071985244751, \"percentile_inc_nulls\": 0.8920979499816895, \"value_count\": 632, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1264.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8890731930732727, \"percentile_inc_nulls\": 0.8893713355064392, 
\"value_count\": 628, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 628.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8864045143127441, \"percentile_inc_nulls\": 0.8867098093032837, \"value_count\": 613, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 613.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8837401866912842, \"percentile_inc_nulls\": 0.8840526342391968, \"value_count\": 612, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 612.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.88108891248703, \"percentile_inc_nulls\": 0.8814084529876709, \"value_count\": 609, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 609.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8786247968673706, \"percentile_inc_nulls\": 0.8789510130882263, \"value_count\": 566, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 566.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.876182496547699, \"percentile_inc_nulls\": 0.87651526927948, \"value_count\": 561, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 561.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8737488985061646, \"percentile_inc_nulls\": 0.8740882277488708, \"value_count\": 559, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 559.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.87142413854599, 
\"percentile_inc_nulls\": 0.8717697262763977, \"value_count\": 534, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 534.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8691777586936951, \"percentile_inc_nulls\": 0.8695293664932251, \"value_count\": 516, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 516.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8669357299804688, \"percentile_inc_nulls\": 0.8672933578491211, \"value_count\": 515, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 515.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.864698052406311, \"percentile_inc_nulls\": 0.8650616407394409, \"value_count\": 514, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 514.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8624733686447144, \"percentile_inc_nulls\": 0.8628430366516113, \"value_count\": 511, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 511.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8580589294433594, \"percentile_inc_nulls\": 0.8584403991699219, \"value_count\": 507, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1014.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8558691740036011, \"percentile_inc_nulls\": 0.8562564849853516, \"value_count\": 503, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 503.0, \"distinct_value_count\": 11612}, 
{\"percentile_ex_nulls\": 0.853775143623352, \"percentile_inc_nulls\": 0.8541681170463562, \"value_count\": 481, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 481.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8516898155212402, \"percentile_inc_nulls\": 0.8520883917808533, \"value_count\": 479, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 479.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8496524095535278, \"percentile_inc_nulls\": 0.8500564098358154, \"value_count\": 468, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 468.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8476279973983765, \"percentile_inc_nulls\": 0.8480374813079834, \"value_count\": 465, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 465.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8456079959869385, \"percentile_inc_nulls\": 0.84602290391922, \"value_count\": 464, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 464.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8436489105224609, \"percentile_inc_nulls\": 0.8440691232681274, \"value_count\": 450, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 450.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8398004174232483, \"percentile_inc_nulls\": 0.8402310013771057, \"value_count\": 442, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 884.0, 
\"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8378892540931702, \"percentile_inc_nulls\": 0.8383249044418335, \"value_count\": 439, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 439.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8360172510147095, \"percentile_inc_nulls\": 0.8364579677581787, \"value_count\": 430, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 430.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8341495990753174, \"percentile_inc_nulls\": 0.8345953226089478, \"value_count\": 429, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 429.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8322862982749939, \"percentile_inc_nulls\": 0.8327370882034302, \"value_count\": 428, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 428.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8304448127746582, \"percentile_inc_nulls\": 0.8309004902839661, \"value_count\": 423, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 423.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8286119699478149, \"percentile_inc_nulls\": 0.8290725946426392, \"value_count\": 421, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 421.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8267922401428223, \"percentile_inc_nulls\": 0.8272577524185181, \"value_count\": 418, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, 
\"sum_tokens_in_value_count_group\": 418.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8249855041503906, \"percentile_inc_nulls\": 0.825455904006958, \"value_count\": 415, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 415.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.823187530040741, \"percentile_inc_nulls\": 0.8236627578735352, \"value_count\": 413, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 413.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8213939070701599, \"percentile_inc_nulls\": 0.8218739032745361, \"value_count\": 412, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 412.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8178240060806274, \"percentile_inc_nulls\": 0.8183136582374573, \"value_count\": 410, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 820.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.816056489944458, \"percentile_inc_nulls\": 0.8165508508682251, \"value_count\": 406, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 406.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.812617301940918, \"percentile_inc_nulls\": 0.8131208419799805, \"value_count\": 395, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 790.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8109150528907776, \"percentile_inc_nulls\": 0.8114232420921326, \"value_count\": 391, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, 
\"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 391.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8092389702796936, \"percentile_inc_nulls\": 0.8097516298294067, \"value_count\": 385, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 385.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8059042096138, \"percentile_inc_nulls\": 0.8064258098602295, \"value_count\": 383, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 766.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8025955557823181, \"percentile_inc_nulls\": 0.8031260967254639, \"value_count\": 380, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 760.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.8009586334228516, \"percentile_inc_nulls\": 0.8014935851097107, \"value_count\": 376, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 376.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7993347644805908, \"percentile_inc_nulls\": 0.7998740673065186, \"value_count\": 373, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 373.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7977153062820435, \"percentile_inc_nulls\": 0.7982589602470398, \"value_count\": 372, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 372.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7961306571960449, \"percentile_inc_nulls\": 0.7966785430908203, \"value_count\": 364, \"group_name\": 
\"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 364.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7945720553398132, \"percentile_inc_nulls\": 0.7951241731643677, \"value_count\": 358, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 358.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7930309772491455, \"percentile_inc_nulls\": 0.7935872077941895, \"value_count\": 354, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 354.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7915289998054504, \"percentile_inc_nulls\": 0.7920892834663391, \"value_count\": 345, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 345.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7900357246398926, \"percentile_inc_nulls\": 0.790600061416626, \"value_count\": 343, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 343.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7885729670524597, \"percentile_inc_nulls\": 0.7891411781311035, \"value_count\": 336, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 336.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7871232628822327, \"percentile_inc_nulls\": 0.7876954078674316, \"value_count\": 333, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 333.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7856822609901428, \"percentile_inc_nulls\": 0.7862582206726074, 
\"value_count\": 331, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 331.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7842717170715332, \"percentile_inc_nulls\": 0.7848514914512634, \"value_count\": 324, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 324.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7828916907310486, \"percentile_inc_nulls\": 0.7834751605987549, \"value_count\": 317, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 317.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7801402807235718, \"percentile_inc_nulls\": 0.7807311415672302, \"value_count\": 316, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 632.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7787688970565796, \"percentile_inc_nulls\": 0.7793635129928589, \"value_count\": 315, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 315.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7774062752723694, \"percentile_inc_nulls\": 0.77800452709198, \"value_count\": 313, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 313.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7760480046272278, \"percentile_inc_nulls\": 0.7766498923301697, \"value_count\": 312, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 312.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.774702787399292, 
\"percentile_inc_nulls\": 0.7753082513809204, \"value_count\": 309, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 309.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7733619213104248, \"percentile_inc_nulls\": 0.7739709615707397, \"value_count\": 308, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 308.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7720253467559814, \"percentile_inc_nulls\": 0.7726380825042725, \"value_count\": 307, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 307.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7706975936889648, \"percentile_inc_nulls\": 0.7713138461112976, \"value_count\": 305, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 305.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7681115865707397, \"percentile_inc_nulls\": 0.7687348127365112, \"value_count\": 297, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 594.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7668229341506958, \"percentile_inc_nulls\": 0.7674496173858643, \"value_count\": 296, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 296.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7642543911933899, \"percentile_inc_nulls\": 0.7648879885673523, \"value_count\": 295, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 590.0, \"distinct_value_count\": 11612}, 
{\"percentile_ex_nulls\": 0.7629787921905518, \"percentile_inc_nulls\": 0.7636158466339111, \"value_count\": 293, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 293.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7617163062095642, \"percentile_inc_nulls\": 0.7623567581176758, \"value_count\": 290, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 290.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.759269654750824, \"percentile_inc_nulls\": 0.7599166631698608, \"value_count\": 281, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 562.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.756831705570221, \"percentile_inc_nulls\": 0.7574852705001831, \"value_count\": 280, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 560.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7556257843971252, \"percentile_inc_nulls\": 0.7562825679779053, \"value_count\": 277, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 277.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7544373273849487, \"percentile_inc_nulls\": 0.7550972700119019, \"value_count\": 273, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 273.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.753253161907196, \"percentile_inc_nulls\": 0.7539162635803223, \"value_count\": 272, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 272.0, 
\"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7497268319129944, \"percentile_inc_nulls\": 0.7503994703292847, \"value_count\": 270, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 810.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.747410774230957, \"percentile_inc_nulls\": 0.748089611530304, \"value_count\": 266, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 532.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7462570667266846, \"percentile_inc_nulls\": 0.7469390630722046, \"value_count\": 265, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 265.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7451338768005371, \"percentile_inc_nulls\": 0.7458188533782959, \"value_count\": 258, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 258.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7440193891525269, \"percentile_inc_nulls\": 0.7447073459625244, \"value_count\": 256, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 256.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7429092526435852, \"percentile_inc_nulls\": 0.7436002492904663, \"value_count\": 255, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 255.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7418078184127808, \"percentile_inc_nulls\": 0.7425017356872559, \"value_count\": 253, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, 
\"sum_tokens_in_value_count_group\": 253.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7407151460647583, \"percentile_inc_nulls\": 0.7414119243621826, \"value_count\": 251, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 251.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7396267652511597, \"percentile_inc_nulls\": 0.7403265237808228, \"value_count\": 250, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 250.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.737476110458374, \"percentile_inc_nulls\": 0.7381816506385803, \"value_count\": 247, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 494.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7353428602218628, \"percentile_inc_nulls\": 0.7360541820526123, \"value_count\": 245, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 490.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7332184314727783, \"percentile_inc_nulls\": 0.7339353561401367, \"value_count\": 244, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 488.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7321605086326599, \"percentile_inc_nulls\": 0.7328803539276123, \"value_count\": 243, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 243.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7311112880706787, \"percentile_inc_nulls\": 0.7318339347839355, \"value_count\": 241, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 
229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 241.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7300664782524109, \"percentile_inc_nulls\": 0.7307919263839722, \"value_count\": 240, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 240.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7279855012893677, \"percentile_inc_nulls\": 0.7287166118621826, \"value_count\": 239, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 478.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7269493937492371, \"percentile_inc_nulls\": 0.7276831865310669, \"value_count\": 238, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 238.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7259175777435303, \"percentile_inc_nulls\": 0.7266542315483093, \"value_count\": 237, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 237.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7248901724815369, \"percentile_inc_nulls\": 0.7256295680999756, \"value_count\": 236, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 236.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7238671183586121, \"percentile_inc_nulls\": 0.7246092557907104, \"value_count\": 235, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 235.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7218557596206665, \"percentile_inc_nulls\": 0.7226033210754395, \"value_count\": 231, \"group_name\": 
\"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 462.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7198618650436401, \"percentile_inc_nulls\": 0.7206147909164429, \"value_count\": 229, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 458.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7188736796379089, \"percentile_inc_nulls\": 0.719629168510437, \"value_count\": 227, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 227.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7178941369056702, \"percentile_inc_nulls\": 0.7186523079872131, \"value_count\": 225, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 225.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7169276475906372, \"percentile_inc_nulls\": 0.7176884412765503, \"value_count\": 222, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 222.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7159655094146729, \"percentile_inc_nulls\": 0.716728925704956, \"value_count\": 221, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 221.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7150077819824219, \"percentile_inc_nulls\": 0.7157737016677856, \"value_count\": 220, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 220.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7131009101867676, \"percentile_inc_nulls\": 0.7138720154762268, 
\"value_count\": 219, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 438.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.709304690361023, \"percentile_inc_nulls\": 0.7100859880447388, \"value_count\": 218, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 872.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.708368718624115, \"percentile_inc_nulls\": 0.7091524600982666, \"value_count\": 215, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 215.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7065054178237915, \"percentile_inc_nulls\": 0.707294225692749, \"value_count\": 214, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 428.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.705578088760376, \"percentile_inc_nulls\": 0.7063694000244141, \"value_count\": 213, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 213.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7028093338012695, \"percentile_inc_nulls\": 0.7036080360412598, \"value_count\": 212, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 636.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7009808421134949, \"percentile_inc_nulls\": 0.7017844915390015, \"value_count\": 210, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 420.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.7000709772109985, 
\"percentile_inc_nulls\": 0.7008770704269409, \"value_count\": 209, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 209.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6982599496841431, \"percentile_inc_nulls\": 0.6990708708763123, \"value_count\": 208, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 416.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6973587274551392, \"percentile_inc_nulls\": 0.6981720924377441, \"value_count\": 207, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 207.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6955651044845581, \"percentile_inc_nulls\": 0.6963832974433899, \"value_count\": 206, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 412.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6937975883483887, \"percentile_inc_nulls\": 0.6946204900741577, \"value_count\": 203, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 406.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6929181814193726, \"percentile_inc_nulls\": 0.6937434673309326, \"value_count\": 202, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 202.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.692043125629425, \"percentile_inc_nulls\": 0.6928707957267761, \"value_count\": 201, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 201.0, \"distinct_value_count\": 11612}, 
{\"percentile_ex_nulls\": 0.6911724805831909, \"percentile_inc_nulls\": 0.6920024156570435, \"value_count\": 200, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6894571781158447, \"percentile_inc_nulls\": 0.6902917623519897, \"value_count\": 197, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 394.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6877505779266357, \"percentile_inc_nulls\": 0.6885898113250732, \"value_count\": 196, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 392.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6869059801101685, \"percentile_inc_nulls\": 0.6877474784851074, \"value_count\": 194, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 194.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6860701441764832, \"percentile_inc_nulls\": 0.6869138479232788, \"value_count\": 192, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 192.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6835756301879883, \"percentile_inc_nulls\": 0.6844260096549988, \"value_count\": 191, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 573.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.682752788066864, \"percentile_inc_nulls\": 0.683605432510376, \"value_count\": 189, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 
189.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6819343566894531, \"percentile_inc_nulls\": 0.6827892065048218, \"value_count\": 188, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 188.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6794919967651367, \"percentile_inc_nulls\": 0.6803534030914307, \"value_count\": 187, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 561.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.676253080368042, \"percentile_inc_nulls\": 0.6771231293678284, \"value_count\": 186, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 744.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6754476428031921, \"percentile_inc_nulls\": 0.6763198971748352, \"value_count\": 185, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 185.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6746509671211243, \"percentile_inc_nulls\": 0.6755253672599792, \"value_count\": 183, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 183.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.673858642578125, \"percentile_inc_nulls\": 0.6747351884841919, \"value_count\": 182, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 182.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6722826957702637, \"percentile_inc_nulls\": 0.6731634140014648, \"value_count\": 181, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, 
\"sum_tokens_in_value_count_group\": 362.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6715034246444702, \"percentile_inc_nulls\": 0.6723862886428833, \"value_count\": 179, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 179.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6699622869491577, \"percentile_inc_nulls\": 0.6708492636680603, \"value_count\": 177, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 354.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6684298515319824, \"percentile_inc_nulls\": 0.6693209409713745, \"value_count\": 176, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 352.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6661442518234253, \"percentile_inc_nulls\": 0.667041540145874, \"value_count\": 175, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 525.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6646292209625244, \"percentile_inc_nulls\": 0.6655305624008179, \"value_count\": 174, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 348.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6638760566711426, \"percentile_inc_nulls\": 0.6647794246673584, \"value_count\": 173, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 173.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6623871922492981, \"percentile_inc_nulls\": 0.6632945537567139, \"value_count\": 171, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 
229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 342.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6586867570877075, \"percentile_inc_nulls\": 0.6596040725708008, \"value_count\": 170, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 850.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6572152376174927, \"percentile_inc_nulls\": 0.6581364870071411, \"value_count\": 169, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 338.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6564838886260986, \"percentile_inc_nulls\": 0.6574070453643799, \"value_count\": 168, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 168.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6543027758598328, \"percentile_inc_nulls\": 0.6552318334579468, \"value_count\": 167, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 501.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6528661251068115, \"percentile_inc_nulls\": 0.6537990570068359, \"value_count\": 165, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 330.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6514555811882019, \"percentile_inc_nulls\": 0.6523923277854919, \"value_count\": 162, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 324.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6507546901702881, \"percentile_inc_nulls\": 0.6516932845115662, \"value_count\": 161, \"group_name\": 
\"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 161.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6493615508079529, \"percentile_inc_nulls\": 0.6503039598464966, \"value_count\": 160, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 320.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6479771137237549, \"percentile_inc_nulls\": 0.6489232778549194, \"value_count\": 159, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 318.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6472892761230469, \"percentile_inc_nulls\": 0.6482372283935547, \"value_count\": 158, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 158.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6452388167381287, \"percentile_inc_nulls\": 0.6461922526359558, \"value_count\": 157, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 471.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6445596814155579, \"percentile_inc_nulls\": 0.645514965057373, \"value_count\": 156, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 156.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6425353288650513, \"percentile_inc_nulls\": 0.643496036529541, \"value_count\": 155, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 465.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6405240297317505, \"percentile_inc_nulls\": 0.64149010181427, 
\"value_count\": 154, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 462.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6392005681991577, \"percentile_inc_nulls\": 0.6401702165603638, \"value_count\": 152, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 304.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6385431289672852, \"percentile_inc_nulls\": 0.6395145654678345, \"value_count\": 151, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 151.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6365840435028076, \"percentile_inc_nulls\": 0.6375607848167419, \"value_count\": 150, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 450.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6352954506874084, \"percentile_inc_nulls\": 0.6362756490707397, \"value_count\": 148, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 296.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6340242624282837, \"percentile_inc_nulls\": 0.6350078582763672, \"value_count\": 146, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 292.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.63213050365448, \"percentile_inc_nulls\": 0.6331191062927246, \"value_count\": 145, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 435.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6315035820007324, 
\"percentile_inc_nulls\": 0.6324939131736755, \"value_count\": 144, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6302584409713745, \"percentile_inc_nulls\": 0.6312521696090698, \"value_count\": 143, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 286.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6290220618247986, \"percentile_inc_nulls\": 0.6300190687179565, \"value_count\": 142, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 284.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6271805763244629, \"percentile_inc_nulls\": 0.6281825304031372, \"value_count\": 141, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 423.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6253520846366882, \"percentile_inc_nulls\": 0.6263589859008789, \"value_count\": 140, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 420.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.62354975938797, \"percentile_inc_nulls\": 0.6245614886283875, \"value_count\": 138, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 414.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6223568916320801, \"percentile_inc_nulls\": 0.6233718395233154, \"value_count\": 137, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 274.0, \"distinct_value_count\": 11612}, 
{\"percentile_ex_nulls\": 0.6193965077400208, \"percentile_inc_nulls\": 0.6204193830490112, \"value_count\": 136, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 680.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6170456409454346, \"percentile_inc_nulls\": 0.618074893951416, \"value_count\": 135, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 540.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.616462230682373, \"percentile_inc_nulls\": 0.6174930334091187, \"value_count\": 134, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 134.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6141462326049805, \"percentile_inc_nulls\": 0.6151832342147827, \"value_count\": 133, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 532.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6135715246200562, \"percentile_inc_nulls\": 0.6146100759506226, \"value_count\": 132, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 132.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6130012273788452, \"percentile_inc_nulls\": 0.6140413284301758, \"value_count\": 131, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 131.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6124353408813477, \"percentile_inc_nulls\": 0.6134768724441528, \"value_count\": 130, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 
130.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6113120913505554, \"percentile_inc_nulls\": 0.6123567223548889, \"value_count\": 129, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 258.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6090831160545349, \"percentile_inc_nulls\": 0.6101337671279907, \"value_count\": 128, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 512.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6068715453147888, \"percentile_inc_nulls\": 0.6079280972480774, \"value_count\": 127, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 508.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.605774462223053, \"percentile_inc_nulls\": 0.6068339347839355, \"value_count\": 126, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 252.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6046860814094543, \"percentile_inc_nulls\": 0.6057485342025757, \"value_count\": 125, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 250.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6036064624786377, \"percentile_inc_nulls\": 0.6046717166900635, \"value_count\": 124, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 248.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.6019999980926514, \"percentile_inc_nulls\": 0.6030696630477905, \"value_count\": 123, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, 
\"sum_tokens_in_value_count_group\": 369.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5988132357597351, \"percentile_inc_nulls\": 0.5998914241790771, \"value_count\": 122, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 732.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5977597236633301, \"percentile_inc_nulls\": 0.5988407135009766, \"value_count\": 121, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 242.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.596192479133606, \"percentile_inc_nulls\": 0.5972777009010315, \"value_count\": 120, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 360.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5941201448440552, \"percentile_inc_nulls\": 0.5952110290527344, \"value_count\": 119, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 476.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5936064720153809, \"percentile_inc_nulls\": 0.5946986675262451, \"value_count\": 118, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 118.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5915690660476685, \"percentile_inc_nulls\": 0.592666745185852, \"value_count\": 117, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 468.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5895664691925049, \"percentile_inc_nulls\": 0.5906695127487183, \"value_count\": 115, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 
229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 460.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5875812768936157, \"percentile_inc_nulls\": 0.5886896848678589, \"value_count\": 114, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 456.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.585613489151001, \"percentile_inc_nulls\": 0.5867271423339844, \"value_count\": 113, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 452.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5841506719589233, \"percentile_inc_nulls\": 0.5852683186531067, \"value_count\": 112, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 336.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5817345380783081, \"percentile_inc_nulls\": 0.5828586220741272, \"value_count\": 111, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 555.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5788612365722656, \"percentile_inc_nulls\": 0.5799930095672607, \"value_count\": 110, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 660.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5764885544776917, \"percentile_inc_nulls\": 0.5776268243789673, \"value_count\": 109, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 545.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5746078491210938, \"percentile_inc_nulls\": 0.5757511258125305, \"value_count\": 108, \"group_name\": 
\"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 432.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5722787380218506, \"percentile_inc_nulls\": 0.5734282732009888, \"value_count\": 107, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 535.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5704329013824463, \"percentile_inc_nulls\": 0.571587324142456, \"value_count\": 106, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 424.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5681473016738892, \"percentile_inc_nulls\": 0.5693079233169556, \"value_count\": 105, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 525.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5663362741470337, \"percentile_inc_nulls\": 0.5675017237663269, \"value_count\": 104, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 416.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5645425915718079, \"percentile_inc_nulls\": 0.5657129287719727, \"value_count\": 103, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 412.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5623223185539246, \"percentile_inc_nulls\": 0.5634986162185669, \"value_count\": 102, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 510.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5601238012313843, \"percentile_inc_nulls\": 0.5613059997558594, 
\"value_count\": 101, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 505.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5588177442550659, \"percentile_inc_nulls\": 0.5600034594535828, \"value_count\": 100, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 300.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5570937991142273, \"percentile_inc_nulls\": 0.5582841634750366, \"value_count\": 99, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 396.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5549606084823608, \"percentile_inc_nulls\": 0.5561566352844238, \"value_count\": 98, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 490.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.552426815032959, \"percentile_inc_nulls\": 0.5536297559738159, \"value_count\": 97, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 582.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5490833520889282, \"percentile_inc_nulls\": 0.5502952337265015, \"value_count\": 96, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 768.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5466018915176392, \"percentile_inc_nulls\": 0.5478204488754272, \"value_count\": 95, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 570.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5441465377807617, 
\"percentile_inc_nulls\": 0.5453716516494751, \"value_count\": 94, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 564.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5425270199775696, \"percentile_inc_nulls\": 0.5437564849853516, \"value_count\": 93, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 372.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.540524423122406, \"percentile_inc_nulls\": 0.5417592525482178, \"value_count\": 92, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 460.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5385435819625854, \"percentile_inc_nulls\": 0.539783775806427, \"value_count\": 91, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 455.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5361927151679993, \"percentile_inc_nulls\": 0.537439227104187, \"value_count\": 90, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 540.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5338679552078247, \"percentile_inc_nulls\": 0.5351207256317139, \"value_count\": 89, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 534.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5315693020820618, \"percentile_inc_nulls\": 0.5328282117843628, \"value_count\": 88, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 528.0, \"distinct_value_count\": 11612}, 
{\"percentile_ex_nulls\": 0.5300543308258057, \"percentile_inc_nulls\": 0.5313172936439514, \"value_count\": 87, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 348.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5289310812950134, \"percentile_inc_nulls\": 0.5301971435546875, \"value_count\": 86, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 258.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5248606204986572, \"percentile_inc_nulls\": 0.5261375904083252, \"value_count\": 85, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 935.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5215693712234497, \"percentile_inc_nulls\": 0.5228551626205444, \"value_count\": 84, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 756.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5186786651611328, \"percentile_inc_nulls\": 0.5199722051620483, \"value_count\": 83, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 664.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5158227682113647, \"percentile_inc_nulls\": 0.5171239972114563, \"value_count\": 82, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 656.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5130016803741455, \"percentile_inc_nulls\": 0.5143105387687683, \"value_count\": 81, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 648.0, 
\"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5102154612541199, \"percentile_inc_nulls\": 0.5115317702293396, \"value_count\": 80, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 640.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5074640512466431, \"percentile_inc_nulls\": 0.5087877511978149, \"value_count\": 79, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 632.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5047475099563599, \"percentile_inc_nulls\": 0.5060784816741943, \"value_count\": 78, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 624.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.5027361512184143, \"percentile_inc_nulls\": 0.5040726065635681, \"value_count\": 77, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 462.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.4990966320037842, \"percentile_inc_nulls\": 0.5004428625106812, \"value_count\": 76, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 836.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.49583154916763306, \"percentile_inc_nulls\": 0.497186541557312, \"value_count\": 75, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 750.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.492287814617157, \"percentile_inc_nulls\": 0.4936522841453552, \"value_count\": 74, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, 
\"sum_tokens_in_value_count_group\": 814.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.4897453784942627, \"percentile_inc_nulls\": 0.4911167025566101, \"value_count\": 73, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 584.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.48723775148391724, \"percentile_inc_nulls\": 0.48861581087112427, \"value_count\": 72, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 576.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.48414677381515503, \"percentile_inc_nulls\": 0.4855331778526306, \"value_count\": 71, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 710.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.48353731632232666, \"percentile_inc_nulls\": 0.4849253296852112, \"value_count\": 70, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 140.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.48233574628829956, \"percentile_inc_nulls\": 0.48372697830200195, \"value_count\": 69, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 276.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.47937536239624023, \"percentile_inc_nulls\": 0.48077458143234253, \"value_count\": 68, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 680.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.47733360528945923, \"percentile_inc_nulls\": 0.4787382483482361, \"value_count\": 67, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 
229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 469.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.47474759817123413, \"percentile_inc_nulls\": 0.4761592745780945, \"value_count\": 66, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 594.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.47304975986480713, \"percentile_inc_nulls\": 0.4744659662246704, \"value_count\": 65, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 390.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.46914905309677124, \"percentile_inc_nulls\": 0.470575749874115, \"value_count\": 64, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 896.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.4655835032463074, \"percentile_inc_nulls\": 0.46701979637145996, \"value_count\": 63, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 819.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.4620746374130249, \"percentile_inc_nulls\": 0.463520348072052, \"value_count\": 62, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 806.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.45782560110092163, \"percentile_inc_nulls\": 0.4592827558517456, \"value_count\": 61, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 976.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.45390748977661133, \"percentile_inc_nulls\": 0.45537513494491577, \"value_count\": 60, \"group_name\": 
\"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 900.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.45185261964797974, \"percentile_inc_nulls\": 0.4533258080482483, \"value_count\": 59, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 472.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.44831758737564087, \"percentile_inc_nulls\": 0.4498002529144287, \"value_count\": 58, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 812.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.4448435306549072, \"percentile_inc_nulls\": 0.4463355541229248, \"value_count\": 57, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 798.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.4411866068840027, \"percentile_inc_nulls\": 0.4426884055137634, \"value_count\": 56, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 840.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.43807387351989746, \"percentile_inc_nulls\": 0.43958407640457153, \"value_count\": 55, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 715.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.4350177049636841, \"percentile_inc_nulls\": 0.43653613328933716, \"value_count\": 54, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 702.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.43086445331573486, \"percentile_inc_nulls\": 0.4323940873146057, 
\"value_count\": 53, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 954.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.4249785542488098, \"percentile_inc_nulls\": 0.42652398347854614, \"value_count\": 52, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1352.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.4220921993255615, \"percentile_inc_nulls\": 0.42364537715911865, \"value_count\": 51, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 663.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.4181740880012512, \"percentile_inc_nulls\": 0.4197377562522888, \"value_count\": 50, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 900.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.4136943221092224, \"percentile_inc_nulls\": 0.41527003049850464, \"value_count\": 49, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1029.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.41055983304977417, \"percentile_inc_nulls\": 0.4121439456939697, \"value_count\": 48, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 720.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.40605831146240234, \"percentile_inc_nulls\": 0.4076545834541321, \"value_count\": 47, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1034.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.4018528461456299, 
\"percentile_inc_nulls\": 0.40346038341522217, \"value_count\": 46, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 966.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.39773881435394287, \"percentile_inc_nulls\": 0.3993574380874634, \"value_count\": 45, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 945.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.39390772581100464, \"percentile_inc_nulls\": 0.3955366611480713, \"value_count\": 44, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 880.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.3896021246910095, \"percentile_inc_nulls\": 0.3912426233291626, \"value_count\": 43, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 989.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.3866766095161438, \"percentile_inc_nulls\": 0.38832491636276245, \"value_count\": 42, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 672.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.38096481561660767, \"percentile_inc_nulls\": 0.38262850046157837, \"value_count\": 41, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1312.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.3760889172554016, \"percentile_inc_nulls\": 0.3777657151222229, \"value_count\": 40, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1120.0, \"distinct_value_count\": 11612}, 
{\"percentile_ex_nulls\": 0.3715047240257263, \"percentile_inc_nulls\": 0.37319380044937134, \"value_count\": 39, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1053.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.36488741636276245, \"percentile_inc_nulls\": 0.3665943145751953, \"value_count\": 38, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1520.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.3584442138671875, \"percentile_inc_nulls\": 0.36016845703125, \"value_count\": 37, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1480.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.35389918088912964, \"percentile_inc_nulls\": 0.3556356430053711, \"value_count\": 36, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1044.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.35039466619491577, \"percentile_inc_nulls\": 0.35214048624038696, \"value_count\": 35, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 805.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.34521400928497314, \"percentile_inc_nulls\": 0.3469737768173218, \"value_count\": 34, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1190.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.3393237590789795, \"percentile_inc_nulls\": 0.3410993218421936, \"value_count\": 33, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 
1353.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.33291542530059814, \"percentile_inc_nulls\": 0.33470821380615234, \"value_count\": 32, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1472.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.3269772529602051, \"percentile_inc_nulls\": 0.3287860155105591, \"value_count\": 31, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1364.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.3180961608886719, \"percentile_inc_nulls\": 0.31992876529693604, \"value_count\": 30, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2040.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.30850106477737427, \"percentile_inc_nulls\": 0.31035947799682617, \"value_count\": 29, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2204.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.2969207763671875, \"percentile_inc_nulls\": 0.29881036281585693, \"value_count\": 28, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2660.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.2848137617111206, \"percentile_inc_nulls\": 0.2867358326911926, \"value_count\": 27, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2781.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.276550829410553, \"percentile_inc_nulls\": 0.2784951329231262, \"value_count\": 26, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 
230320, \"sum_tokens_in_value_count_group\": 1898.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.2687145471572876, \"percentile_inc_nulls\": 0.27067995071411133, \"value_count\": 25, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1800.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.25753480195999146, \"percentile_inc_nulls\": 0.259530246257782, \"value_count\": 24, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2568.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.24732154607772827, \"percentile_inc_nulls\": 0.24934440851211548, \"value_count\": 23, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2346.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.23803120851516724, \"percentile_inc_nulls\": 0.24007904529571533, \"value_count\": 22, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2134.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.22779178619384766, \"percentile_inc_nulls\": 0.22986716032028198, \"value_count\": 21, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2352.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.21899771690368652, \"percentile_inc_nulls\": 0.22109675407409668, \"value_count\": 20, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2020.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.20865821838378906, \"percentile_inc_nulls\": 0.21078497171401978, \"value_count\": 19, \"group_name\": \"_zip_code_\", 
\"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2375.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.19792252779006958, \"percentile_inc_nulls\": 0.20007812976837158, \"value_count\": 18, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2466.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.18689513206481934, \"percentile_inc_nulls\": 0.18908041715621948, \"value_count\": 17, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2533.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.1774219274520874, \"percentile_inc_nulls\": 0.17963266372680664, \"value_count\": 16, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2176.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.16514509916305542, \"percentile_inc_nulls\": 0.16738885641098022, \"value_count\": 15, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2820.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.15679514408111572, \"percentile_inc_nulls\": 0.15906131267547607, \"value_count\": 14, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1918.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.1475135087966919, \"percentile_inc_nulls\": 0.14980459213256836, \"value_count\": 13, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2132.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.13732635974884033, \"percentile_inc_nulls\": 0.13964486122131348, 
\"value_count\": 12, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2340.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.12607258558273315, \"percentile_inc_nulls\": 0.12842130661010742, \"value_count\": 11, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2585.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.11405694484710693, \"percentile_inc_nulls\": 0.11643797159194946, \"value_count\": 10, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2760.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.10151892900466919, \"percentile_inc_nulls\": 0.10393363237380981, \"value_count\": 9, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2880.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.08891123533248901, \"percentile_inc_nulls\": 0.09135985374450684, \"value_count\": 8, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2896.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.07592916488647461, \"percentile_inc_nulls\": 0.07841265201568604, \"value_count\": 7, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2982.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.06292092800140381, \"percentile_inc_nulls\": 0.06543940305709839, \"value_count\": 6, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2988.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 
0.049229204654693604, \"percentile_inc_nulls\": 0.0517844557762146, \"value_count\": 5, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3145.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.034880101680755615, \"percentile_inc_nulls\": 0.037473976612091064, \"value_count\": 4, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3296.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.02078789472579956, \"percentile_inc_nulls\": 0.023419618606567383, \"value_count\": 3, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 3237.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.008188903331756592, \"percentile_inc_nulls\": 0.010854482650756836, \"value_count\": 2, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2894.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0026875734329223633, \"value_count\": 1, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 1881.0, \"distinct_value_count\": 11612}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 2277, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"sum_tokens_in_value_count_group\": 2277.0, \"distinct_value_count\": 11612}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": 
\"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"zip_code\\\"\", \"subtitle\": \"In this col, 619 values (0.3%) are null and there are 11612 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 2277, \"group_name\": \"_zip_code_\", \"value\": \"10022\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 11612}, {\"value_count\": 1939, \"group_name\": \"_zip_code_\", \"value\": \"10036\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 11612}, {\"value_count\": 1821, \"group_name\": \"_zip_code_\", \"value\": \"10019\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 11612}, {\"value_count\": 1603, \"group_name\": \"_zip_code_\", \"value\": \"00000\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 11612}, {\"value_count\": 1434, \"group_name\": \"_zip_code_\", \"value\": \"02110\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 11612}, {\"value_count\": 1374, \"group_name\": \"_zip_code_\", \"value\": \"10017\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 11612}, {\"value_count\": 1369, \"group_name\": \"_zip_code_\", \"value\": \"77002\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 11612}, {\"value_count\": 1339, \"group_name\": \"_zip_code_\", \"value\": \"10013\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 11612}, {\"value_count\": 1124, \"group_name\": \"_zip_code_\", 
\"value\": \"92121\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 11612}, {\"value_count\": 1026, \"group_name\": \"_zip_code_\", \"value\": \"91302\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 11612}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"55446-0106\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 11612}, {\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"75140\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 11612}, {\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"79550\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 11612}, {\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"06410\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 11612}, {\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"98327\", \"total_non_null_rows\": 229701, \"total_rows_inc_nulls\": 230320, \"distinct_value_count\": 11612}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], 
\"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 2277]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}], \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\"}, {\"mode\": \"vega-lite\"});\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"vconcat\": [{\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9998449683189392, \"percentile_inc_nulls\": 0.9998449683189392, \"value_count\": 10, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 10.0, \"distinct_value_count\": 46959}, {\"percentile_ex_nulls\": 0.9993489980697632, \"percentile_inc_nulls\": 0.9993489980697632, \"value_count\": 8, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 32.0, \"distinct_value_count\": 46959}, {\"percentile_ex_nulls\": 0.9980469346046448, \"percentile_inc_nulls\": 0.9980469942092896, \"value_count\": 7, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 84.0, \"distinct_value_count\": 46959}, {\"percentile_ex_nulls\": 0.9937686920166016, \"percentile_inc_nulls\": 0.9937688708305359, \"value_count\": 6, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 276.0, \"distinct_value_count\": 46959}, {\"percentile_ex_nulls\": 0.9755553007125854, \"percentile_inc_nulls\": 0.9755560755729675, \"value_count\": 5, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1175.0, \"distinct_value_count\": 46959}, {\"percentile_ex_nulls\": 0.9293010830879211, \"percentile_inc_nulls\": 0.9293032884597778, \"value_count\": 4, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 2984.0, \"distinct_value_count\": 46959}, {\"percentile_ex_nulls\": 
0.806442141532898, \"percentile_inc_nulls\": 0.8064481019973755, \"value_count\": 3, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 7926.0, \"distinct_value_count\": 46959}, {\"percentile_ex_nulls\": 0.535085916519165, \"percentile_inc_nulls\": 0.5351003408432007, \"value_count\": 2, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 17506.0, \"distinct_value_count\": 46959}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 3.0994415283203125e-05, \"value_count\": 1, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 34520.0, \"distinct_value_count\": 46959}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 10, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 10.0, \"distinct_value_count\": 46959}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"company_name\\\"\", \"subtitle\": \"In this col, 2 values (0.0%) are null and there are 46959 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 10, \"group_name\": \"_company_name_\", \"value\": \"comprehensive care 
corporation\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 46959}, {\"value_count\": 8, \"group_name\": \"_company_name_\", \"value\": \"acacia research corporation\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 46959}, {\"value_count\": 8, \"group_name\": \"_company_name_\", \"value\": \"brandywine realty trust\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 46959}, {\"value_count\": 8, \"group_name\": \"_company_name_\", \"value\": \"empire petroleum corporation\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 46959}, {\"value_count\": 8, \"group_name\": \"_company_name_\", \"value\": \"la jolla pharmaceutical company\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 46959}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"omega healthcare investors incorporated\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 46959}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"citigroup incorporated\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 46959}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"central european media enterprises limited\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 46959}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"softech incorporated\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 46959}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"dycom industries incorporated\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 46959}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": 
\"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"evolent health incorporated\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 46959}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"united guardian incorporated\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 46959}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"cue health incorporated\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 46959}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"gs mortgage securities trust 2020 gsa2\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 46959}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"accolade incorporated\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 46959}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 10]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by 
value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.8313045501708984, \"percentile_inc_nulls\": 0.8315895795822144, \"value_count\": 10865, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 10865.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.6936154961585999, \"percentile_inc_nulls\": 0.6941331624984741, \"value_count\": 8868, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 8868.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.6153929829597473, \"percentile_inc_nulls\": 0.6160427927970886, \"value_count\": 5038, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 5038.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.5645126104354858, \"percentile_inc_nulls\": 0.5652483701705933, \"value_count\": 3277, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 3277.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.525261640548706, \"percentile_inc_nulls\": 0.5260636806488037, \"value_count\": 2528, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 2528.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.4904046058654785, \"percentile_inc_nulls\": 0.49126559495925903, \"value_count\": 2245, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 2245.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.4588702917098999, \"percentile_inc_nulls\": 0.4597845673561096, \"value_count\": 2031, \"group_name\": \"_state_\", 
\"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 2031.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.4301152229309082, \"percentile_inc_nulls\": 0.4310780167579651, \"value_count\": 1852, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1852.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.40471386909484863, \"percentile_inc_nulls\": 0.4057195782661438, \"value_count\": 1636, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1636.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.379716157913208, \"percentile_inc_nulls\": 0.3807641863822937, \"value_count\": 1610, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1610.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.35474956035614014, \"percentile_inc_nulls\": 0.35583972930908203, \"value_count\": 1608, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1608.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.3316926956176758, \"percentile_inc_nulls\": 0.3328218460083008, \"value_count\": 1485, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1485.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.30921030044555664, \"percentile_inc_nulls\": 0.31037741899490356, \"value_count\": 1448, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1448.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.2887929677963257, \"percentile_inc_nulls\": 0.28999459743499756, \"value_count\": 1315, \"group_name\": 
\"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1315.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.26895010471343994, \"percentile_inc_nulls\": 0.2701852321624756, \"value_count\": 1278, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1278.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.25052011013031006, \"percentile_inc_nulls\": 0.25178641080856323, \"value_count\": 1187, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1187.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.23381364345550537, \"percentile_inc_nulls\": 0.2351081371307373, \"value_count\": 1076, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1076.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.21730893850326538, \"percentile_inc_nulls\": 0.2186313271522522, \"value_count\": 1063, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1063.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.20310217142105103, \"percentile_inc_nulls\": 0.20444858074188232, \"value_count\": 915, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 915.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.19002890586853027, \"percentile_inc_nulls\": 0.19139736890792847, \"value_count\": 842, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 842.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.17740583419799805, \"percentile_inc_nulls\": 0.17879563570022583, \"value_count\": 813, 
\"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 813.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.16577649116516113, \"percentile_inc_nulls\": 0.16718590259552002, \"value_count\": 749, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 749.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.15436452627182007, \"percentile_inc_nulls\": 0.15579324960708618, \"value_count\": 735, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 735.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.14396172761917114, \"percentile_inc_nulls\": 0.145408034324646, \"value_count\": 670, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 670.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.134443998336792, \"percentile_inc_nulls\": 0.1359063982963562, \"value_count\": 613, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 613.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.12548518180847168, \"percentile_inc_nulls\": 0.12696272134780884, \"value_count\": 577, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 577.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.11661958694458008, \"percentile_inc_nulls\": 0.11811208724975586, \"value_count\": 571, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 571.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.10901159048080444, \"percentile_inc_nulls\": 0.11051690578460693, \"value_count\": 490, 
\"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 490.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.1015588641166687, \"percentile_inc_nulls\": 0.10307681560516357, \"value_count\": 480, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 480.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.09496009349822998, \"percentile_inc_nulls\": 0.09648919105529785, \"value_count\": 425, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 425.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.08944821357727051, \"percentile_inc_nulls\": 0.09098660945892334, \"value_count\": 355, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 355.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.08441758155822754, \"percentile_inc_nulls\": 0.08596450090408325, \"value_count\": 324, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 324.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0800391435623169, \"percentile_inc_nulls\": 0.08159345388412476, \"value_count\": 282, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 282.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.07572275400161743, \"percentile_inc_nulls\": 0.07728433609008789, \"value_count\": 278, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 278.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0716392993927002, \"percentile_inc_nulls\": 0.0732077956199646, \"value_count\": 263, 
\"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 263.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.06772661209106445, \"percentile_inc_nulls\": 0.06930172443389893, \"value_count\": 252, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 252.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.06382948160171509, \"percentile_inc_nulls\": 0.06541115045547485, \"value_count\": 251, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 251.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.06002545356750488, \"percentile_inc_nulls\": 0.06161355972290039, \"value_count\": 245, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 245.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.05636119842529297, \"percentile_inc_nulls\": 0.05795550346374512, \"value_count\": 236, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 236.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.050181686878204346, \"percentile_inc_nulls\": 0.05178642272949219, \"value_count\": 199, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 398.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.04723161458969116, \"percentile_inc_nulls\": 0.04884135723114014, \"value_count\": 190, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 190.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.04459214210510254, \"percentile_inc_nulls\": 0.04620629549026489, \"value_count\": 
170, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 170.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.042309701442718506, \"percentile_inc_nulls\": 0.043927788734436035, \"value_count\": 147, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 147.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.040089428424835205, \"percentile_inc_nulls\": 0.04171121120452881, \"value_count\": 143, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 143.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.03811758756637573, \"percentile_inc_nulls\": 0.03974270820617676, \"value_count\": 127, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 127.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.03622335195541382, \"percentile_inc_nulls\": 0.037851691246032715, \"value_count\": 122, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 122.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.034375667572021484, \"percentile_inc_nulls\": 0.0360071063041687, \"value_count\": 119, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 119.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.03262120485305786, \"percentile_inc_nulls\": 0.03425562381744385, \"value_count\": 113, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 113.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.03092879056930542, \"percentile_inc_nulls\": 0.032566070556640625, 
\"value_count\": 109, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 109.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.029251933097839355, \"percentile_inc_nulls\": 0.030892014503479004, \"value_count\": 108, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.027590572834014893, \"percentile_inc_nulls\": 0.02923351526260376, \"value_count\": 107, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 107.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.026162147521972656, \"percentile_inc_nulls\": 0.02780747413635254, \"value_count\": 92, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 92.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.024826884269714355, \"percentile_inc_nulls\": 0.026474475860595703, \"value_count\": 86, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 86.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.023600280284881592, \"percentile_inc_nulls\": 0.025249958038330078, \"value_count\": 79, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 79.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.022389233112335205, \"percentile_inc_nulls\": 0.024040937423706055, \"value_count\": 78, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.02119368314743042, \"percentile_inc_nulls\": 
0.022847414016723633, \"value_count\": 77, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 77.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.020060241222381592, \"percentile_inc_nulls\": 0.021715879440307617, \"value_count\": 73, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 73.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.019051015377044678, \"percentile_inc_nulls\": 0.02070838212966919, \"value_count\": 65, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 65.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.017156779766082764, \"percentile_inc_nulls\": 0.01881730556488037, \"value_count\": 61, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 122.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.015324652194976807, \"percentile_inc_nulls\": 0.016988277435302734, \"value_count\": 59, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 118.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.013647794723510742, \"percentile_inc_nulls\": 0.015314280986785889, \"value_count\": 54, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.01287144422531128, \"percentile_inc_nulls\": 0.014539241790771484, \"value_count\": 50, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 50.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.012203812599182129, 
\"percentile_inc_nulls\": 0.013872742652893066, \"value_count\": 43, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 43.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.011551737785339355, \"percentile_inc_nulls\": 0.01322174072265625, \"value_count\": 42, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.010433793067932129, \"percentile_inc_nulls\": 0.012105703353881836, \"value_count\": 36, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 72.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.009921431541442871, \"percentile_inc_nulls\": 0.011594176292419434, \"value_count\": 33, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.00947117805480957, \"percentile_inc_nulls\": 0.011144697666168213, \"value_count\": 29, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 29.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.008694827556610107, \"percentile_inc_nulls\": 0.010369658470153809, \"value_count\": 25, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 50.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.008337736129760742, \"percentile_inc_nulls\": 0.010013163089752197, \"value_count\": 23, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 23.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.007996141910552979, 
\"percentile_inc_nulls\": 0.009672164916992188, \"value_count\": 22, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 22.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.007685601711273193, \"percentile_inc_nulls\": 0.00936216115951538, \"value_count\": 20, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 20.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.007390618324279785, \"percentile_inc_nulls\": 0.009067654609680176, \"value_count\": 19, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 19.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0071111321449279785, \"percentile_inc_nulls\": 0.008788645267486572, \"value_count\": 18, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 18.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.006847202777862549, \"percentile_inc_nulls\": 0.00852513313293457, \"value_count\": 17, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 17.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.006598770618438721, \"percentile_inc_nulls\": 0.00827711820602417, \"value_count\": 16, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.005946636199951172, \"percentile_inc_nulls\": 0.0076261162757873535, \"value_count\": 14, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 
0.005542933940887451, \"percentile_inc_nulls\": 0.0072231292724609375, \"value_count\": 13, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 26.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0045182108879089355, \"percentile_inc_nulls\": 0.006200134754180908, \"value_count\": 11, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 66.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.004052400588989258, \"percentile_inc_nulls\": 0.0057350993156433105, \"value_count\": 10, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 30.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.003912687301635742, \"percentile_inc_nulls\": 0.0055956244468688965, \"value_count\": 9, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 9.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.003664255142211914, \"percentile_inc_nulls\": 0.005347609519958496, \"value_count\": 8, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0031208395957946777, \"percentile_inc_nulls\": 0.004805088043212891, \"value_count\": 7, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 35.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0025618672370910645, \"percentile_inc_nulls\": 0.004247069358825684, \"value_count\": 6, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 36.0, \"distinct_value_count\": 173}, 
{\"percentile_ex_nulls\": 0.0018631815910339355, \"percentile_inc_nulls\": 0.0035495758056640625, \"value_count\": 5, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 45.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0011799931526184082, \"percentile_inc_nulls\": 0.002867579460144043, \"value_count\": 4, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 44.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0007142424583435059, \"percentile_inc_nulls\": 0.0024025440216064453, \"value_count\": 3, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 30.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.00034159421920776367, \"percentile_inc_nulls\": 0.0020305514335632324, \"value_count\": 2, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 24.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0016895532608032227, \"value_count\": 1, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 22.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 10865, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 10865.0, \"distinct_value_count\": 173}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": 
\"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"state\\\"\", \"subtitle\": \"In this col, 109 values (0.2%) are null and there are 173 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 10865, \"group_name\": \"_state_\", \"value\": \"ca\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 173}, {\"value_count\": 8868, \"group_name\": \"_state_\", \"value\": \"ny\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 173}, {\"value_count\": 5038, \"group_name\": \"_state_\", \"value\": \"tx\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 173}, {\"value_count\": 3277, \"group_name\": \"_state_\", \"value\": \"fl\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 173}, {\"value_count\": 2528, \"group_name\": \"_state_\", \"value\": \"ma\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 173}, {\"value_count\": 2245, \"group_name\": \"_state_\", \"value\": \"il\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 173}, {\"value_count\": 2031, \"group_name\": \"_state_\", \"value\": \"nj\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 173}, {\"value_count\": 1852, \"group_name\": \"_state_\", \"value\": \"pa\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 173}, {\"value_count\": 1636, \"group_name\": \"_state_\", \"value\": \"md\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, 
\"distinct_value_count\": 173}, {\"value_count\": 1610, \"group_name\": \"_state_\", \"value\": \"co\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 173}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"r4\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 173}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"lo\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 173}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"h9\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 173}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"j1\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 173}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"w5\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 173}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 
10865]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.89018714427948, \"percentile_inc_nulls\": 0.8903045654296875, \"value_count\": 7077, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 7077.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.8659808039665222, \"percentile_inc_nulls\": 0.866124153137207, \"value_count\": 1560, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1560.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.8488967418670654, \"percentile_inc_nulls\": 0.8490583896636963, \"value_count\": 1101, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1101.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.8348074555397034, \"percentile_inc_nulls\": 0.8349841237068176, \"value_count\": 908, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 908.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.8216956853866577, \"percentile_inc_nulls\": 0.821886420249939, \"value_count\": 845, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 845.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.8089252710342407, \"percentile_inc_nulls\": 0.8091296553611755, \"value_count\": 823, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 823.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.7962014675140381, 
\"percentile_inc_nulls\": 0.7964194416999817, \"value_count\": 820, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 820.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.7720572352409363, \"percentile_inc_nulls\": 0.7723010182380676, \"value_count\": 778, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1556.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.7602488994598389, \"percentile_inc_nulls\": 0.7605053186416626, \"value_count\": 761, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 761.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.7490767240524292, \"percentile_inc_nulls\": 0.7493451237678528, \"value_count\": 720, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 720.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.7294944524765015, \"percentile_inc_nulls\": 0.7297837734222412, \"value_count\": 631, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1262.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.7197343111038208, \"percentile_inc_nulls\": 0.720034122467041, \"value_count\": 629, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 629.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.7100052833557129, \"percentile_inc_nulls\": 0.710315465927124, \"value_count\": 627, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 627.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.7003227472305298, 
\"percentile_inc_nulls\": 0.7006433010101318, \"value_count\": 624, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 624.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6910591721534729, \"percentile_inc_nulls\": 0.6913895606994629, \"value_count\": 597, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 597.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6818732023239136, \"percentile_inc_nulls\": 0.6822134256362915, \"value_count\": 592, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 592.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6750768423080444, \"percentile_inc_nulls\": 0.675424337387085, \"value_count\": 438, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 438.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6684821844100952, \"percentile_inc_nulls\": 0.6688367128372192, \"value_count\": 425, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 425.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6620736718177795, \"percentile_inc_nulls\": 0.6624350547790527, \"value_count\": 413, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 413.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6557738184928894, \"percentile_inc_nulls\": 0.656141996383667, \"value_count\": 406, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 406.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6501877307891846, 
\"percentile_inc_nulls\": 0.6505619287490845, \"value_count\": 360, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 360.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6447724103927612, \"percentile_inc_nulls\": 0.6451523303985596, \"value_count\": 349, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 349.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6395586729049683, \"percentile_inc_nulls\": 0.6399441957473755, \"value_count\": 336, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 336.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6344846487045288, \"percentile_inc_nulls\": 0.6348755955696106, \"value_count\": 327, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 327.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6294417381286621, \"percentile_inc_nulls\": 0.6298379898071289, \"value_count\": 325, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 325.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6245849132537842, \"percentile_inc_nulls\": 0.6249864101409912, \"value_count\": 313, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 313.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6198056936264038, \"percentile_inc_nulls\": 0.6202123165130615, \"value_count\": 308, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 308.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6151506900787354, 
\"percentile_inc_nulls\": 0.6155622601509094, \"value_count\": 300, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 300.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6105421781539917, \"percentile_inc_nulls\": 0.6109586954116821, \"value_count\": 297, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 297.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6062284708023071, \"percentile_inc_nulls\": 0.60664963722229, \"value_count\": 278, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 278.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6020078659057617, \"percentile_inc_nulls\": 0.6024335622787476, \"value_count\": 272, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 272.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5978803634643555, \"percentile_inc_nulls\": 0.5983104705810547, \"value_count\": 266, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 266.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5937684178352356, \"percentile_inc_nulls\": 0.5942028760910034, \"value_count\": 265, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 265.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5898736715316772, \"percentile_inc_nulls\": 0.5903123617172241, \"value_count\": 251, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 251.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.586103081703186, 
\"percentile_inc_nulls\": 0.5865457653999329, \"value_count\": 243, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 243.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5823790431022644, \"percentile_inc_nulls\": 0.5828256607055664, \"value_count\": 240, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 240.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5750240087509155, \"percentile_inc_nulls\": 0.5754785537719727, \"value_count\": 237, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 474.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.571424126625061, \"percentile_inc_nulls\": 0.5718824863433838, \"value_count\": 232, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 232.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5678397417068481, \"percentile_inc_nulls\": 0.5683019161224365, \"value_count\": 231, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 231.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5643794536590576, \"percentile_inc_nulls\": 0.5648453831672668, \"value_count\": 223, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 223.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5610278248786926, \"percentile_inc_nulls\": 0.5614973306655884, \"value_count\": 216, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 216.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5543556213378906, 
\"percentile_inc_nulls\": 0.5548322200775146, \"value_count\": 215, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 430.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5510504841804504, \"percentile_inc_nulls\": 0.551530659198761, \"value_count\": 213, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 213.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.547947108745575, \"percentile_inc_nulls\": 0.5484305620193481, \"value_count\": 200, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5448592901229858, \"percentile_inc_nulls\": 0.5453460216522217, \"value_count\": 199, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 199.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5419421195983887, \"percentile_inc_nulls\": 0.5424319505691528, \"value_count\": 188, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 188.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5390404462814331, \"percentile_inc_nulls\": 0.5395334362983704, \"value_count\": 187, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 187.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5361698269844055, \"percentile_inc_nulls\": 0.5366659164428711, \"value_count\": 185, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 185.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5334388613700867, 
\"percentile_inc_nulls\": 0.5339378118515015, \"value_count\": 176, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 176.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.530769944190979, \"percentile_inc_nulls\": 0.5312718152999878, \"value_count\": 172, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 172.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5281320810317993, \"percentile_inc_nulls\": 0.5286367535591125, \"value_count\": 170, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 170.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5255252122879028, \"percentile_inc_nulls\": 0.5260326862335205, \"value_count\": 168, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 168.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5229494571685791, \"percentile_inc_nulls\": 0.5234596729278564, \"value_count\": 166, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 166.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5204356908798218, \"percentile_inc_nulls\": 0.5209486484527588, \"value_count\": 162, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 162.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5179375410079956, \"percentile_inc_nulls\": 0.518453061580658, \"value_count\": 161, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 161.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5154547691345215, 
\"percentile_inc_nulls\": 0.5159730315208435, \"value_count\": 160, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5105204582214355, \"percentile_inc_nulls\": 0.511043906211853, \"value_count\": 159, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 318.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5056171417236328, \"percentile_inc_nulls\": 0.5061458349227905, \"value_count\": 158, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 316.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5007448196411133, \"percentile_inc_nulls\": 0.5012787580490112, \"value_count\": 157, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 314.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.49352943897247314, \"percentile_inc_nulls\": 0.4940711259841919, \"value_count\": 155, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 465.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.49113988876342773, \"percentile_inc_nulls\": 0.49168407917022705, \"value_count\": 154, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 154.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.4865468740463257, \"percentile_inc_nulls\": 0.48709601163864136, \"value_count\": 148, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 296.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.48426592350006104, 
\"percentile_inc_nulls\": 0.4848175048828125, \"value_count\": 147, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 147.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.47973495721817017, \"percentile_inc_nulls\": 0.4802914261817932, \"value_count\": 146, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 292.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.4775471091270447, \"percentile_inc_nulls\": 0.4781058430671692, \"value_count\": 141, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 141.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.475405752658844, \"percentile_inc_nulls\": 0.47596681118011475, \"value_count\": 138, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.4711541533470154, \"percentile_inc_nulls\": 0.47171974182128906, \"value_count\": 137, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 274.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.4690438508987427, \"percentile_inc_nulls\": 0.4696117043495178, \"value_count\": 136, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 136.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.4649474024772644, \"percentile_inc_nulls\": 0.46551966667175293, \"value_count\": 132, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 264.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.4608820080757141, 
\"percentile_inc_nulls\": 0.46145856380462646, \"value_count\": 131, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 262.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.45886480808258057, \"percentile_inc_nulls\": 0.4594435691833496, \"value_count\": 130, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 130.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.4568631052970886, \"percentile_inc_nulls\": 0.4574440121650696, \"value_count\": 129, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 129.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.45490801334381104, \"percentile_inc_nulls\": 0.45549094676971436, \"value_count\": 126, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 126.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.45298391580581665, \"percentile_inc_nulls\": 0.4535689353942871, \"value_count\": 124, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 124.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.44919776916503906, \"percentile_inc_nulls\": 0.44978684186935425, \"value_count\": 122, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 244.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.44733577966690063, \"percentile_inc_nulls\": 0.4479268193244934, \"value_count\": 120, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.4437047839164734, 
\"percentile_inc_nulls\": 0.44429975748062134, \"value_count\": 117, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.4401359558105469, \"percentile_inc_nulls\": 0.4407346844673157, \"value_count\": 115, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 230.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.43838250637054443, \"percentile_inc_nulls\": 0.4389832019805908, \"value_count\": 113, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 113.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.4349067211151123, \"percentile_inc_nulls\": 0.43551111221313477, \"value_count\": 112, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 224.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.43149304389953613, \"percentile_inc_nulls\": 0.4321010708808899, \"value_count\": 110, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 220.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.42980170249938965, \"percentile_inc_nulls\": 0.43041151762008667, \"value_count\": 109, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 109.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.42812585830688477, \"percentile_inc_nulls\": 0.4287375211715698, \"value_count\": 108, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.42646557092666626, 
\"percentile_inc_nulls\": 0.4270789623260498, \"value_count\": 107, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 107.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.4248983860015869, \"percentile_inc_nulls\": 0.42551344633102417, \"value_count\": 101, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 101.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.4233776926994324, \"percentile_inc_nulls\": 0.42399442195892334, \"value_count\": 98, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.42036741971969604, \"percentile_inc_nulls\": 0.4209873676300049, \"value_count\": 97, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 194.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.41890883445739746, \"percentile_inc_nulls\": 0.41953033208847046, \"value_count\": 94, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 94.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.41602271795272827, \"percentile_inc_nulls\": 0.4166473150253296, \"value_count\": 93, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 186.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.41461068391799927, \"percentile_inc_nulls\": 0.41523677110671997, \"value_count\": 91, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 91.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.41321414709091187, 
\"percentile_inc_nulls\": 0.41384172439575195, \"value_count\": 90, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.410514235496521, \"percentile_inc_nulls\": 0.4111446738243103, \"value_count\": 87, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 174.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.40784531831741333, \"percentile_inc_nulls\": 0.40847867727279663, \"value_count\": 86, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 172.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.4065263867378235, \"percentile_inc_nulls\": 0.4071611166000366, \"value_count\": 85, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 85.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.40522295236587524, \"percentile_inc_nulls\": 0.405859112739563, \"value_count\": 84, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 84.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.4013592600822449, \"percentile_inc_nulls\": 0.4019995331764221, \"value_count\": 83, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 249.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.39881449937820435, \"percentile_inc_nulls\": 0.39945751428604126, \"value_count\": 82, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 164.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3975576162338257, 
\"percentile_inc_nulls\": 0.39820194244384766, \"value_count\": 81, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 81.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3963162899017334, \"percentile_inc_nulls\": 0.39696192741394043, \"value_count\": 80, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 80.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.39386463165283203, \"percentile_inc_nulls\": 0.3945128917694092, \"value_count\": 79, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 158.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.39023369550704956, \"percentile_inc_nulls\": 0.3908858299255371, \"value_count\": 78, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3878440856933594, \"percentile_inc_nulls\": 0.38849878311157227, \"value_count\": 77, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 154.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.38666480779647827, \"percentile_inc_nulls\": 0.38732075691223145, \"value_count\": 76, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 76.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.38317352533340454, \"percentile_inc_nulls\": 0.38383322954177856, \"value_count\": 75, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 225.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3809080719947815, 
\"percentile_inc_nulls\": 0.38157016038894653, \"value_count\": 73, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 146.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3797908425331116, \"percentile_inc_nulls\": 0.3804541826248169, \"value_count\": 72, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 72.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3775874376296997, \"percentile_inc_nulls\": 0.37825310230255127, \"value_count\": 71, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 142.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.37650126218795776, \"percentile_inc_nulls\": 0.37716811895370483, \"value_count\": 70, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3754305839538574, \"percentile_inc_nulls\": 0.3760985732078552, \"value_count\": 69, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 69.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.37226516008377075, \"percentile_inc_nulls\": 0.37293654680252075, \"value_count\": 68, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 204.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3712255358695984, \"percentile_inc_nulls\": 0.37189799547195435, \"value_count\": 67, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 67.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.36917728185653687, 
\"percentile_inc_nulls\": 0.3698519468307495, \"value_count\": 66, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 132.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3661980628967285, \"percentile_inc_nulls\": 0.36687594652175903, \"value_count\": 64, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 192.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.36522048711776733, \"percentile_inc_nulls\": 0.36589938402175903, \"value_count\": 63, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 63.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.36425846815109253, \"percentile_inc_nulls\": 0.3649383783340454, \"value_count\": 62, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 62.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.36236536502838135, \"percentile_inc_nulls\": 0.36304736137390137, \"value_count\": 61, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 122.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3605033755302429, \"percentile_inc_nulls\": 0.3611873388290405, \"value_count\": 60, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.35501039028167725, \"percentile_inc_nulls\": 0.3557002544403076, \"value_count\": 59, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 354.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.35321044921875, 
\"percentile_inc_nulls\": 0.3539022207260132, \"value_count\": 58, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 116.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3523414731025696, \"percentile_inc_nulls\": 0.35303419828414917, \"value_count\": 56, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.35148805379867554, \"percentile_inc_nulls\": 0.35218167304992676, \"value_count\": 55, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 55.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.34813642501831055, \"percentile_inc_nulls\": 0.3488336205482483, \"value_count\": 54, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 216.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.34649163484573364, \"percentile_inc_nulls\": 0.3471905589103699, \"value_count\": 53, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 106.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.34245723485946655, \"percentile_inc_nulls\": 0.3431605100631714, \"value_count\": 52, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 260.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3361263871192932, \"percentile_inc_nulls\": 0.33683639764785767, \"value_count\": 51, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 408.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3330230116844177, 
\"percentile_inc_nulls\": 0.3337363600730896, \"value_count\": 50, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.32846105098724365, \"percentile_inc_nulls\": 0.3291792869567871, \"value_count\": 49, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 294.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3262265920639038, \"percentile_inc_nulls\": 0.3269472122192383, \"value_count\": 48, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.32330942153930664, \"percentile_inc_nulls\": 0.32403314113616943, \"value_count\": 47, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 188.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.31974053382873535, \"percentile_inc_nulls\": 0.32046812772750854, \"value_count\": 46, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 230.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3155509829521179, \"percentile_inc_nulls\": 0.31628304719924927, \"value_count\": 45, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 270.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.31213730573654175, \"percentile_inc_nulls\": 0.3128729462623596, \"value_count\": 44, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 220.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3101356029510498, 
\"percentile_inc_nulls\": 0.31087344884872437, \"value_count\": 43, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 129.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3075287938117981, \"percentile_inc_nulls\": 0.3082693815231323, \"value_count\": 42, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 168.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3030754327774048, \"percentile_inc_nulls\": 0.30382078886032104, \"value_count\": 41, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 287.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.29935139417648315, \"percentile_inc_nulls\": 0.30010074377059937, \"value_count\": 40, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 240.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.2987462282180786, \"percentile_inc_nulls\": 0.29949623346328735, \"value_count\": 39, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 39.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.29697734117507935, \"percentile_inc_nulls\": 0.2977291941642761, \"value_count\": 38, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.29295843839645386, \"percentile_inc_nulls\": 0.29371464252471924, \"value_count\": 37, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 259.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.28960680961608887, 
\"percentile_inc_nulls\": 0.29036659002304077, \"value_count\": 36, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 216.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.287434458732605, \"percentile_inc_nulls\": 0.2881965637207031, \"value_count\": 35, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 140.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.28426897525787354, \"percentile_inc_nulls\": 0.2850344777107239, \"value_count\": 34, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 204.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.281196653842926, \"percentile_inc_nulls\": 0.281965434551239, \"value_count\": 33, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 198.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.2782174348831177, \"percentile_inc_nulls\": 0.27898937463760376, \"value_count\": 32, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 192.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.2743692398071289, \"percentile_inc_nulls\": 0.2751452922821045, \"value_count\": 31, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 248.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.2692486643791199, \"percentile_inc_nulls\": 0.2700302004814148, \"value_count\": 30, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 330.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.2669987082481384, 
\"percentile_inc_nulls\": 0.26778268814086914, \"value_count\": 29, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 145.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.2630884647369385, \"percentile_inc_nulls\": 0.26387661695480347, \"value_count\": 28, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 252.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.25889891386032104, \"percentile_inc_nulls\": 0.2596915364265442, \"value_count\": 27, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 270.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.2552679777145386, \"percentile_inc_nulls\": 0.2560644745826721, \"value_count\": 26, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.2502250075340271, \"percentile_inc_nulls\": 0.25102686882019043, \"value_count\": 25, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 325.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.24426651000976562, \"percentile_inc_nulls\": 0.2450748085975647, \"value_count\": 24, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 384.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.2389131784439087, \"percentile_inc_nulls\": 0.2397271990776062, \"value_count\": 23, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 345.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.23447537422180176, 
\"percentile_inc_nulls\": 0.23529410362243652, \"value_count\": 22, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 286.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.2289358377456665, \"percentile_inc_nulls\": 0.2297605276107788, \"value_count\": 21, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 357.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.22428077459335327, \"percentile_inc_nulls\": 0.22511041164398193, \"value_count\": 20, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 300.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.21691030263900757, \"percentile_inc_nulls\": 0.21774780750274658, \"value_count\": 19, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 475.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.21216213703155518, \"percentile_inc_nulls\": 0.2130047082901001, \"value_count\": 18, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 306.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.20794153213500977, \"percentile_inc_nulls\": 0.20878863334655762, \"value_count\": 17, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 272.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.2014865279197693, \"percentile_inc_nulls\": 0.20234054327011108, \"value_count\": 16, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 416.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.19380569458007812, 
\"percentile_inc_nulls\": 0.19466793537139893, \"value_count\": 15, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 495.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.18620240688323975, \"percentile_inc_nulls\": 0.18707275390625, \"value_count\": 14, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 490.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.17853707075119019, \"percentile_inc_nulls\": 0.17941564321517944, \"value_count\": 13, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 494.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.17071658372879028, \"percentile_inc_nulls\": 0.1716035008430481, \"value_count\": 12, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 504.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.16269433498382568, \"percentile_inc_nulls\": 0.16358983516693115, \"value_count\": 11, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 517.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.15384972095489502, \"percentile_inc_nulls\": 0.15475469827651978, \"value_count\": 10, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 570.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.1442137360572815, \"percentile_inc_nulls\": 0.1451290249824524, \"value_count\": 9, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 621.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.1342829465866089, 
\"percentile_inc_nulls\": 0.1352088451385498, \"value_count\": 8, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 640.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.12331253290176392, \"percentile_inc_nulls\": 0.12425017356872559, \"value_count\": 7, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 707.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.1114886999130249, \"percentile_inc_nulls\": 0.11243897676467896, \"value_count\": 6, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 762.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.09736835956573486, \"percentile_inc_nulls\": 0.09833371639251709, \"value_count\": 5, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 910.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.08272039890289307, \"percentile_inc_nulls\": 0.083701491355896, \"value_count\": 4, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 944.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.06396055221557617, \"percentile_inc_nulls\": 0.06496161222457886, \"value_count\": 3, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1209.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.038668036460876465, \"percentile_inc_nulls\": 0.03969621658325195, \"value_count\": 2, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1630.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 
0.0010695457458496094, \"value_count\": 1, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 2492.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 7077, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 7077.0, \"distinct_value_count\": 5261}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"city\\\"\", \"subtitle\": \"In this col, 69 values (0.1%) are null and there are 5261 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 7077, \"group_name\": \"_city_\", \"value\": \"new york\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 5261}, {\"value_count\": 1560, \"group_name\": \"_city_\", \"value\": \"houston\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 5261}, {\"value_count\": 1101, \"group_name\": \"_city_\", \"value\": \"dallas\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 5261}, {\"value_count\": 908, \"group_name\": \"_city_\", \"value\": \"las vegas\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 5261}, {\"value_count\": 845, 
\"group_name\": \"_city_\", \"value\": \"calabasas\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 5261}, {\"value_count\": 823, \"group_name\": \"_city_\", \"value\": \"charlotte\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 5261}, {\"value_count\": 820, \"group_name\": \"_city_\", \"value\": \"chicago\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 5261}, {\"value_count\": 778, \"group_name\": \"_city_\", \"value\": \"boston\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 5261}, {\"value_count\": 778, \"group_name\": \"_city_\", \"value\": \"san diego\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 5261}, {\"value_count\": 761, \"group_name\": \"_city_\", \"value\": \"wilmington\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 5261}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"new prague\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 5261}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"vallejo\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 5261}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"watseka\", 
\"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 5261}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"temple\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 5261}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"elk city\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 5261}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 7077]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9876123666763306, \"percentile_inc_nulls\": 0.9876307845115662, \"value_count\": 798, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 798.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9790434241294861, \"percentile_inc_nulls\": 0.9790746569633484, \"value_count\": 552, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 552.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.973858654499054, \"percentile_inc_nulls\": 0.9738975167274475, \"value_count\": 334, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 334.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 
0.9691861271858215, \"percentile_inc_nulls\": 0.9692319631576538, \"value_count\": 301, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 301.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9648861289024353, \"percentile_inc_nulls\": 0.9649384021759033, \"value_count\": 277, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 277.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.960710346698761, \"percentile_inc_nulls\": 0.9607688188552856, \"value_count\": 269, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 269.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9570778608322144, \"percentile_inc_nulls\": 0.9571417570114136, \"value_count\": 234, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9534919857978821, \"percentile_inc_nulls\": 0.9535611867904663, \"value_count\": 231, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 231.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9500147700309753, \"percentile_inc_nulls\": 0.9500890970230103, \"value_count\": 224, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 224.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9466617107391357, \"percentile_inc_nulls\": 0.9467410445213318, \"value_count\": 216, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 
216.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9434173107147217, \"percentile_inc_nulls\": 0.9435015320777893, \"value_count\": 209, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 209.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9403126239776611, \"percentile_inc_nulls\": 0.9404014348983765, \"value_count\": 200, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9372545480728149, \"percentile_inc_nulls\": 0.9373478889465332, \"value_count\": 197, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 197.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9343982338905334, \"percentile_inc_nulls\": 0.9344958662986755, \"value_count\": 184, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 184.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9317126870155334, \"percentile_inc_nulls\": 0.9318143129348755, \"value_count\": 173, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 173.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.929104745388031, \"percentile_inc_nulls\": 0.9292102456092834, \"value_count\": 168, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 168.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9266365766525269, \"percentile_inc_nulls\": 0.9267457127571106, \"value_count\": 159, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, 
\"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 159.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9242925047874451, \"percentile_inc_nulls\": 0.9244051575660706, \"value_count\": 151, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 151.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9222744703292847, \"percentile_inc_nulls\": 0.9223901629447937, \"value_count\": 130, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 130.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9203029870986938, \"percentile_inc_nulls\": 0.9204216003417969, \"value_count\": 127, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 127.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9183470606803894, \"percentile_inc_nulls\": 0.9184685945510864, \"value_count\": 126, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 126.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9164066314697266, \"percentile_inc_nulls\": 0.9165310263633728, \"value_count\": 125, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 125.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9146214723587036, \"percentile_inc_nulls\": 0.91474848985672, \"value_count\": 115, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 115.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9129294157028198, \"percentile_inc_nulls\": 0.9130589962005615, \"value_count\": 109, 
\"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 109.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.911268413066864, \"percentile_inc_nulls\": 0.9114004373550415, \"value_count\": 107, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 107.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9096229076385498, \"percentile_inc_nulls\": 0.9097574353218079, \"value_count\": 106, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 106.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9081016778945923, \"percentile_inc_nulls\": 0.908238410949707, \"value_count\": 98, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9066113829612732, \"percentile_inc_nulls\": 0.9067503809928894, \"value_count\": 96, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 96.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9051367044448853, \"percentile_inc_nulls\": 0.9052778482437134, \"value_count\": 95, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 95.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9036774635314941, \"percentile_inc_nulls\": 0.903820812702179, \"value_count\": 94, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 94.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9022338390350342, 
\"percentile_inc_nulls\": 0.9023792743682861, \"value_count\": 93, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 93.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.900805652141571, \"percentile_inc_nulls\": 0.9009532928466797, \"value_count\": 92, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 92.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8994551301002502, \"percentile_inc_nulls\": 0.8996047377586365, \"value_count\": 87, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 87.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.898213267326355, \"percentile_inc_nulls\": 0.8983647227287292, \"value_count\": 80, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 80.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8970179557800293, \"percentile_inc_nulls\": 0.8971711993217468, \"value_count\": 77, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 77.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8959158062934875, \"percentile_inc_nulls\": 0.896070659160614, \"value_count\": 71, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 71.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8937735557556152, \"percentile_inc_nulls\": 0.8939316272735596, \"value_count\": 69, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 
40345}, {\"percentile_ex_nulls\": 0.8927335143089294, \"percentile_inc_nulls\": 0.8928931355476379, \"value_count\": 67, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 67.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8917089700698853, \"percentile_inc_nulls\": 0.8918701410293579, \"value_count\": 66, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 66.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.890715479850769, \"percentile_inc_nulls\": 0.8908780813217163, \"value_count\": 64, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 64.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8897685408592224, \"percentile_inc_nulls\": 0.8899325728416443, \"value_count\": 61, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 61.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8888682126998901, \"percentile_inc_nulls\": 0.8890335559844971, \"value_count\": 58, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.887129545211792, \"percentile_inc_nulls\": 0.887297511100769, \"value_count\": 56, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8863068222999573, \"percentile_inc_nulls\": 0.8864760398864746, \"value_count\": 53, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, 
\"sum_tokens_in_value_count_group\": 53.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8839317560195923, \"percentile_inc_nulls\": 0.8841044902801514, \"value_count\": 51, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 153.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8831555843353271, \"percentile_inc_nulls\": 0.883329451084137, \"value_count\": 50, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 50.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8823949694633484, \"percentile_inc_nulls\": 0.8825699687004089, \"value_count\": 49, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 49.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8816808462142944, \"percentile_inc_nulls\": 0.8818569183349609, \"value_count\": 46, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 46.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8810133934020996, \"percentile_inc_nulls\": 0.8811904191970825, \"value_count\": 43, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 43.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.880376935005188, \"percentile_inc_nulls\": 0.8805549144744873, \"value_count\": 41, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 41.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.879755973815918, \"percentile_inc_nulls\": 0.8799349069595337, \"value_count\": 40, \"group_name\": \"_street_address_\", 
\"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 40.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.877939760684967, \"percentile_inc_nulls\": 0.8781213760375977, \"value_count\": 39, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 117.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.877349853515625, \"percentile_inc_nulls\": 0.8775323629379272, \"value_count\": 38, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 38.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8762011528015137, \"percentile_inc_nulls\": 0.8763853311538696, \"value_count\": 37, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 74.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8745245933532715, \"percentile_inc_nulls\": 0.874711275100708, \"value_count\": 36, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8734379410743713, \"percentile_inc_nulls\": 0.8736262917518616, \"value_count\": 35, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8718545436859131, \"percentile_inc_nulls\": 0.8720452785491943, \"value_count\": 34, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8703177571296692, \"percentile_inc_nulls\": 0.8705106973648071, 
\"value_count\": 33, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8683307766914368, \"percentile_inc_nulls\": 0.8685266971588135, \"value_count\": 32, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 128.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.866887092590332, \"percentile_inc_nulls\": 0.8670851588249207, \"value_count\": 31, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 93.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8654899597167969, \"percentile_inc_nulls\": 0.8656901717185974, \"value_count\": 30, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8645896315574646, \"percentile_inc_nulls\": 0.8647911548614502, \"value_count\": 29, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8628510236740112, \"percentile_inc_nulls\": 0.8630551099777222, \"value_count\": 28, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8599171042442322, \"percentile_inc_nulls\": 0.8601255416870117, \"value_count\": 27, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 189.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 
0.859513521194458, \"percentile_inc_nulls\": 0.8597225546836853, \"value_count\": 26, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 26.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.85679692029953, \"percentile_inc_nulls\": 0.857010006904602, \"value_count\": 25, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 175.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8545615673065186, \"percentile_inc_nulls\": 0.8547779321670532, \"value_count\": 24, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8527763485908508, \"percentile_inc_nulls\": 0.8529953956604004, \"value_count\": 23, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 115.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8507272601127625, \"percentile_inc_nulls\": 0.8509494066238403, \"value_count\": 22, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 132.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8490973114967346, \"percentile_inc_nulls\": 0.8493218421936035, \"value_count\": 21, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 105.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8463031053543091, \"percentile_inc_nulls\": 0.8465318083763123, \"value_count\": 20, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 180.0, 
\"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8439435362815857, \"percentile_inc_nulls\": 0.8441757559776306, \"value_count\": 19, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 152.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8419876098632812, \"percentile_inc_nulls\": 0.8422227501869202, \"value_count\": 18, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 126.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8406681418418884, \"percentile_inc_nulls\": 0.8409051895141602, \"value_count\": 17, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 85.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.837687611579895, \"percentile_inc_nulls\": 0.8379291296005249, \"value_count\": 16, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 192.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8362905383110046, \"percentile_inc_nulls\": 0.8365341424942017, \"value_count\": 15, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8334652781486511, \"percentile_inc_nulls\": 0.8337130546569824, \"value_count\": 14, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 182.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.830034613609314, \"percentile_inc_nulls\": 0.830287516117096, \"value_count\": 13, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, 
\"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 221.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8257501721382141, \"percentile_inc_nulls\": 0.8260094523429871, \"value_count\": 12, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 276.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8211396932601929, \"percentile_inc_nulls\": 0.8214058876037598, \"value_count\": 11, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 297.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8163274526596069, \"percentile_inc_nulls\": 0.8166007995605469, \"value_count\": 10, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 310.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8086434006690979, \"percentile_inc_nulls\": 0.8089281320571899, \"value_count\": 9, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 495.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8003228902816772, \"percentile_inc_nulls\": 0.8006200194358826, \"value_count\": 8, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 536.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.7909778356552124, \"percentile_inc_nulls\": 0.7912888526916504, \"value_count\": 7, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 602.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.7787764668464661, \"percentile_inc_nulls\": 0.7791056632995605, \"value_count\": 6, \"group_name\": 
\"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 786.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.7628650069236755, \"percentile_inc_nulls\": 0.7632178664207458, \"value_count\": 5, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1025.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.7326254844665527, \"percentile_inc_nulls\": 0.7330232858657837, \"value_count\": 4, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1948.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.6715720891952515, \"percentile_inc_nulls\": 0.6720607280731201, \"value_count\": 3, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 3933.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.4995110034942627, \"percentile_inc_nulls\": 0.5002557635307312, \"value_count\": 2, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 11084.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.001488029956817627, \"value_count\": 1, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 32178.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 798, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 798.0, \"distinct_value_count\": 40345}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": 
\"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"street_address\\\"\", \"subtitle\": \"In this col, 96 values (0.1%) are null and there are 40345 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 798, \"group_name\": \"_street_address_\", \"value\": \"4500 park granada\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 40345}, {\"value_count\": 552, \"group_name\": \"_street_address_\", \"value\": \"711 high street\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 40345}, {\"value_count\": 334, \"group_name\": \"_street_address_\", \"value\": \"383 madison avenue\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 40345}, {\"value_count\": 301, \"group_name\": \"_street_address_\", \"value\": \"8400 normandale lake blvd\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 40345}, {\"value_count\": 277, \"group_name\": \"_street_address_\", \"value\": \"1585 broadway\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 40345}, {\"value_count\": 269, \"group_name\": \"_street_address_\", \"value\": \"11 madison avenue\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 40345}, {\"value_count\": 234, \"group_name\": \"_street_address_\", \"value\": \"7485 new horizon way\", 
\"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 40345}, {\"value_count\": 231, \"group_name\": \"_street_address_\", \"value\": \"c/o wilmington trust company\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 40345}, {\"value_count\": 224, \"group_name\": \"_street_address_\", \"value\": \"4ld financial center floor 10\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 40345}, {\"value_count\": 216, \"group_name\": \"_street_address_\", \"value\": \"85 broad street\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 40345}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"2100 east 54th street north\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 40345}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"900 e. 
old settlers boulevard, suite 100\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 40345}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"2400 ellis road\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 40345}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"3000 olympus blvd.\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 40345}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"410 monon blvd\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 40345}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 798]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9859902262687683, \"percentile_inc_nulls\": 0.9860497713088989, \"value_count\": 900, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 900.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.9729300737380981, \"percentile_inc_nulls\": 0.9730450510978699, \"value_count\": 839, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 839.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.9513862133026123, \"percentile_inc_nulls\": 
0.9515926837921143, \"value_count\": 692, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1384.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.9406920671463013, \"percentile_inc_nulls\": 0.9409439563751221, \"value_count\": 687, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 687.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.932099461555481, \"percentile_inc_nulls\": 0.9323878288269043, \"value_count\": 552, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 552.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.9242540001869202, \"percentile_inc_nulls\": 0.924575686454773, \"value_count\": 504, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 504.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.9171868562698364, \"percentile_inc_nulls\": 0.9175385236740112, \"value_count\": 454, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 454.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.9101663827896118, \"percentile_inc_nulls\": 0.9105479121208191, \"value_count\": 451, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 451.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.9037219285964966, \"percentile_inc_nulls\": 0.9041308164596558, \"value_count\": 414, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 414.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8973397016525269, 
\"percentile_inc_nulls\": 0.8977757096290588, \"value_count\": 410, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 410.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8914711475372314, \"percentile_inc_nulls\": 0.8919321298599243, \"value_count\": 377, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 377.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8856960535049438, \"percentile_inc_nulls\": 0.8861815333366394, \"value_count\": 371, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 371.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8799831867218018, \"percentile_inc_nulls\": 0.8804929256439209, \"value_count\": 367, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 367.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8744571208953857, \"percentile_inc_nulls\": 0.8749903440475464, \"value_count\": 355, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 355.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8690711259841919, \"percentile_inc_nulls\": 0.8696272373199463, \"value_count\": 346, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 346.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8637007474899292, \"percentile_inc_nulls\": 0.8642796277999878, \"value_count\": 345, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 345.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 
0.8586105108261108, \"percentile_inc_nulls\": 0.8592110276222229, \"value_count\": 327, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 327.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8535358905792236, \"percentile_inc_nulls\": 0.8541579246520996, \"value_count\": 326, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 326.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8486013412475586, \"percentile_inc_nulls\": 0.8492443561553955, \"value_count\": 317, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 317.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8438692092895508, \"percentile_inc_nulls\": 0.8445322513580322, \"value_count\": 304, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 304.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8344670534133911, \"percentile_inc_nulls\": 0.8351701498031616, \"value_count\": 302, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 604.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8301396369934082, \"percentile_inc_nulls\": 0.8308610320091248, \"value_count\": 278, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 278.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8261702060699463, \"percentile_inc_nulls\": 0.8269084692001343, \"value_count\": 255, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 255.0, \"distinct_value_count\": 8809}, 
{\"percentile_ex_nulls\": 0.8224809765815735, \"percentile_inc_nulls\": 0.8232349157333374, \"value_count\": 237, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 237.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8191341757774353, \"percentile_inc_nulls\": 0.8199023604393005, \"value_count\": 215, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 215.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8160520792007446, \"percentile_inc_nulls\": 0.8168332576751709, \"value_count\": 198, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 198.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8130165934562683, \"percentile_inc_nulls\": 0.8138107061386108, \"value_count\": 195, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 195.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8099967241287231, \"percentile_inc_nulls\": 0.8108037114143372, \"value_count\": 194, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 194.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8069924116134644, \"percentile_inc_nulls\": 0.8078121542930603, \"value_count\": 193, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 193.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8040503859519958, \"percentile_inc_nulls\": 0.8048825860023499, \"value_count\": 189, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 189.0, 
\"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8011394739151001, \"percentile_inc_nulls\": 0.8019840717315674, \"value_count\": 187, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 187.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7954110503196716, \"percentile_inc_nulls\": 0.7962799072265625, \"value_count\": 184, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 368.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7928581237792969, \"percentile_inc_nulls\": 0.7937378883361816, \"value_count\": 164, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 164.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7903519868850708, \"percentile_inc_nulls\": 0.7912423610687256, \"value_count\": 161, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 161.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7878613471984863, \"percentile_inc_nulls\": 0.7887623310089111, \"value_count\": 160, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7829112410545349, \"percentile_inc_nulls\": 0.7838332056999207, \"value_count\": 159, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 318.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7804672718048096, \"percentile_inc_nulls\": 0.781399667263031, \"value_count\": 157, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, 
\"sum_tokens_in_value_count_group\": 157.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7757973670959473, \"percentile_inc_nulls\": 0.7767496109008789, \"value_count\": 150, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 300.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7735869884490967, \"percentile_inc_nulls\": 0.7745485305786133, \"value_count\": 142, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 142.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7691972255706787, \"percentile_inc_nulls\": 0.7701774835586548, \"value_count\": 141, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 282.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7648698091506958, \"percentile_inc_nulls\": 0.7658684253692627, \"value_count\": 139, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 278.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7627216577529907, \"percentile_inc_nulls\": 0.7637293338775635, \"value_count\": 138, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7605890035629272, \"percentile_inc_nulls\": 0.7616058588027954, \"value_count\": 137, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 137.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7585031390190125, \"percentile_inc_nulls\": 0.7595287561416626, \"value_count\": 134, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, 
\"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 134.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7564639449119568, \"percentile_inc_nulls\": 0.7574982643127441, \"value_count\": 131, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 131.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7524166703224182, \"percentile_inc_nulls\": 0.7534681558609009, \"value_count\": 130, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 260.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7484316825866699, \"percentile_inc_nulls\": 0.7495001554489136, \"value_count\": 128, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 256.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7446334958076477, \"percentile_inc_nulls\": 0.7457180619239807, \"value_count\": 122, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 244.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7427811026573181, \"percentile_inc_nulls\": 0.7438734769821167, \"value_count\": 119, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 119.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.739169716835022, \"percentile_inc_nulls\": 0.7402774691581726, \"value_count\": 116, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 232.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7356828451156616, \"percentile_inc_nulls\": 0.7368053793907166, \"value_count\": 112, \"group_name\": \"_zip_code_\", 
\"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 224.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7339705228805542, \"percentile_inc_nulls\": 0.7351003885269165, \"value_count\": 110, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7322738170623779, \"percentile_inc_nulls\": 0.7334108352661133, \"value_count\": 109, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 109.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7305926084518433, \"percentile_inc_nulls\": 0.7317367792129517, \"value_count\": 108, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7272614240646362, \"percentile_inc_nulls\": 0.7284197807312012, \"value_count\": 107, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 214.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7256580591201782, \"percentile_inc_nulls\": 0.7268232107162476, \"value_count\": 103, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 103.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7224825620651245, \"percentile_inc_nulls\": 0.7236611843109131, \"value_count\": 102, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 204.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7209103107452393, \"percentile_inc_nulls\": 0.7220956087112427, \"value_count\": 101, \"group_name\": 
\"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 101.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.717797040939331, \"percentile_inc_nulls\": 0.7189955711364746, \"value_count\": 100, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7116950154304504, \"percentile_inc_nulls\": 0.7129194736480713, \"value_count\": 98, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 392.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7086751461029053, \"percentile_inc_nulls\": 0.7099124193191528, \"value_count\": 97, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 194.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7056863903999329, \"percentile_inc_nulls\": 0.7069363594055176, \"value_count\": 96, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 192.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7042075991630554, \"percentile_inc_nulls\": 0.7054638862609863, \"value_count\": 95, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 95.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7027443647384644, \"percentile_inc_nulls\": 0.7040067911148071, \"value_count\": 94, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 94.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7012966871261597, \"percentile_inc_nulls\": 0.7025653123855591, \"value_count\": 93, 
\"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 93.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6998645663261414, \"percentile_inc_nulls\": 0.7011392712593079, \"value_count\": 92, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 92.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6956616640090942, \"percentile_inc_nulls\": 0.6969541907310486, \"value_count\": 90, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 270.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6942762136459351, \"percentile_inc_nulls\": 0.6955746412277222, \"value_count\": 89, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 89.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.690166711807251, \"percentile_inc_nulls\": 0.6914826035499573, \"value_count\": 88, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 264.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.686103880405426, \"percentile_inc_nulls\": 0.6874370574951172, \"value_count\": 87, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 261.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.682087779045105, \"percentile_inc_nulls\": 0.6834379434585571, \"value_count\": 86, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 258.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6767951846122742, \"percentile_inc_nulls\": 0.6781678795814514, \"value_count\": 
85, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 340.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6741800308227539, \"percentile_inc_nulls\": 0.6755638122558594, \"value_count\": 84, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 168.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6715960502624512, \"percentile_inc_nulls\": 0.6729907989501953, \"value_count\": 83, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 166.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6690431237220764, \"percentile_inc_nulls\": 0.6704487204551697, \"value_count\": 82, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 164.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6665213704109192, \"percentile_inc_nulls\": 0.667937695980072, \"value_count\": 81, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 162.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6652916669845581, \"percentile_inc_nulls\": 0.6667131781578064, \"value_count\": 79, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 79.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6628632545471191, \"percentile_inc_nulls\": 0.6642951369285583, \"value_count\": 78, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 156.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6592674255371094, \"percentile_inc_nulls\": 0.6607145667076111, 
\"value_count\": 77, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 231.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6545352935791016, \"percentile_inc_nulls\": 0.6560025215148926, \"value_count\": 76, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 304.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.651079535484314, \"percentile_inc_nulls\": 0.6525614261627197, \"value_count\": 74, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 222.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6488068103790283, \"percentile_inc_nulls\": 0.6502983570098877, \"value_count\": 73, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 146.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.646565318107605, \"percentile_inc_nulls\": 0.6480663418769836, \"value_count\": 72, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6432496309280396, \"percentile_inc_nulls\": 0.64476478099823, \"value_count\": 71, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 213.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6388910412788391, \"percentile_inc_nulls\": 0.6404247283935547, \"value_count\": 70, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 280.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.636742889881134, \"percentile_inc_nulls\": 
0.6382856369018555, \"value_count\": 69, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6346258521080017, \"percentile_inc_nulls\": 0.636177659034729, \"value_count\": 68, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 136.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6294889450073242, \"percentile_inc_nulls\": 0.6310625076293945, \"value_count\": 66, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 330.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6264535188674927, \"percentile_inc_nulls\": 0.6280399560928345, \"value_count\": 65, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 195.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6234647631645203, \"percentile_inc_nulls\": 0.6250638961791992, \"value_count\": 64, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 192.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6195420026779175, \"percentile_inc_nulls\": 0.6211578845977783, \"value_count\": 63, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 252.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6166466474533081, \"percentile_inc_nulls\": 0.6182748079299927, \"value_count\": 62, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 186.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6147475838661194, 
\"percentile_inc_nulls\": 0.6163837909698486, \"value_count\": 61, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 122.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6110116243362427, \"percentile_inc_nulls\": 0.612663745880127, \"value_count\": 60, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 240.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6100932359695435, \"percentile_inc_nulls\": 0.6117491722106934, \"value_count\": 59, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 59.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6055790185928345, \"percentile_inc_nulls\": 0.607254147529602, \"value_count\": 58, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 290.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.600348711013794, \"percentile_inc_nulls\": 0.602046012878418, \"value_count\": 56, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 336.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5943556427955627, \"percentile_inc_nulls\": 0.5960783958435059, \"value_count\": 55, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 385.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5901527404785156, \"percentile_inc_nulls\": 0.5918933153152466, \"value_count\": 54, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 270.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 
0.5893276929855347, \"percentile_inc_nulls\": 0.5910718441009521, \"value_count\": 53, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 53.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5877087712287903, \"percentile_inc_nulls\": 0.5894597768783569, \"value_count\": 52, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 104.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5829454660415649, \"percentile_inc_nulls\": 0.5847167372703552, \"value_count\": 51, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 306.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5790538787841797, \"percentile_inc_nulls\": 0.5808416604995728, \"value_count\": 50, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 250.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.578291118144989, \"percentile_inc_nulls\": 0.5800821781158447, \"value_count\": 49, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 49.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5753023624420166, \"percentile_inc_nulls\": 0.5771061182022095, \"value_count\": 48, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 192.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5716443061828613, \"percentile_inc_nulls\": 0.5734635591506958, \"value_count\": 47, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 235.0, \"distinct_value_count\": 8809}, 
{\"percentile_ex_nulls\": 0.563767671585083, \"percentile_inc_nulls\": 0.5656204223632812, \"value_count\": 46, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 506.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5560623407363892, \"percentile_inc_nulls\": 0.5579477548599243, \"value_count\": 45, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 495.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5533226728439331, \"percentile_inc_nulls\": 0.5552197098731995, \"value_count\": 44, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 176.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5459597110748291, \"percentile_inc_nulls\": 0.5478881001472473, \"value_count\": 43, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 473.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5407294034957886, \"percentile_inc_nulls\": 0.542680025100708, \"value_count\": 42, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 336.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5349854230880737, \"percentile_inc_nulls\": 0.5369603633880615, \"value_count\": 41, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 369.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5312495231628418, \"percentile_inc_nulls\": 0.5332403182983398, \"value_count\": 40, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 240.0, \"distinct_value_count\": 
8809}, {\"percentile_ex_nulls\": 0.5269998908042908, \"percentile_inc_nulls\": 0.5290087461471558, \"value_count\": 39, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 273.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5204931497573853, \"percentile_inc_nulls\": 0.5225296020507812, \"value_count\": 38, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 418.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5158854722976685, \"percentile_inc_nulls\": 0.5179415941238403, \"value_count\": 37, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 296.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5119627714157104, \"percentile_inc_nulls\": 0.5140354633331299, \"value_count\": 36, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 252.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5081490278244019, \"percentile_inc_nulls\": 0.5102379322052002, \"value_count\": 35, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 245.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5023272037506104, \"percentile_inc_nulls\": 0.5044407844543457, \"value_count\": 34, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 374.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.49564921855926514, \"percentile_inc_nulls\": 0.49779123067855835, \"value_count\": 33, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 429.0, 
\"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.48718106746673584, \"percentile_inc_nulls\": 0.4893590807914734, \"value_count\": 32, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 544.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.47897762060165405, \"percentile_inc_nulls\": 0.48119044303894043, \"value_count\": 31, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 527.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.4687038064002991, \"percentile_inc_nulls\": 0.47096025943756104, \"value_count\": 30, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 660.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.46238380670547485, \"percentile_inc_nulls\": 0.4646671414375305, \"value_count\": 29, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 406.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.4549742341041565, \"percentile_inc_nulls\": 0.4572889804840088, \"value_count\": 28, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 476.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.44951045513153076, \"percentile_inc_nulls\": 0.4518483877182007, \"value_count\": 27, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 351.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.4426301121711731, \"percentile_inc_nulls\": 0.44499731063842773, \"value_count\": 26, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, 
\"sum_tokens_in_value_count_group\": 442.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.4317336082458496, \"percentile_inc_nulls\": 0.4341471195220947, \"value_count\": 25, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 700.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.4261297583580017, \"percentile_inc_nulls\": 0.42856699228286743, \"value_count\": 24, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 360.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.4182531237602234, \"percentile_inc_nulls\": 0.4207238554954529, \"value_count\": 23, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 506.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.40934914350509644, \"percentile_inc_nulls\": 0.4118577241897583, \"value_count\": 22, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 572.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.4001961350440979, \"percentile_inc_nulls\": 0.40274351835250854, \"value_count\": 21, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 588.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.3930355906486511, \"percentile_inc_nulls\": 0.395613431930542, \"value_count\": 20, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 460.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.3841627836227417, \"percentile_inc_nulls\": 0.3867782950401306, \"value_count\": 19, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, 
\"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 570.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.3754767179489136, \"percentile_inc_nulls\": 0.37812912464141846, \"value_count\": 18, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 558.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.3662147521972656, \"percentile_inc_nulls\": 0.3689064383506775, \"value_count\": 17, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 595.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.35450881719589233, \"percentile_inc_nulls\": 0.35725027322769165, \"value_count\": 16, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 752.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.34353452920913696, \"percentile_inc_nulls\": 0.34632253646850586, \"value_count\": 15, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 705.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.33242011070251465, \"percentile_inc_nulls\": 0.33525538444519043, \"value_count\": 14, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 714.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.32088541984558105, \"percentile_inc_nulls\": 0.3237696886062622, \"value_count\": 13, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 741.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.3087436556816101, \"percentile_inc_nulls\": 0.311679482460022, \"value_count\": 12, \"group_name\": \"_zip_code_\", 
\"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 780.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.29453152418136597, \"percentile_inc_nulls\": 0.2975277304649353, \"value_count\": 11, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 913.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.279743492603302, \"percentile_inc_nulls\": 0.2828024625778198, \"value_count\": 10, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 950.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.264893114566803, \"percentile_inc_nulls\": 0.26801520586013794, \"value_count\": 9, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 954.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.24857956171035767, \"percentile_inc_nulls\": 0.25177091360092163, \"value_count\": 8, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1048.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.2263507843017578, \"percentile_inc_nulls\": 0.22963649034500122, \"value_count\": 7, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1428.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.20225399732589722, \"percentile_inc_nulls\": 0.20564210414886475, \"value_count\": 6, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1548.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.1747015118598938, \"percentile_inc_nulls\": 0.17820662260055542, \"value_count\": 5, \"group_name\": 
\"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1770.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.14406687021255493, \"percentile_inc_nulls\": 0.14770209789276123, \"value_count\": 4, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1968.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.10684764385223389, \"percentile_inc_nulls\": 0.11064094305038452, \"value_count\": 3, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 2391.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.05915224552154541, \"percentile_inc_nulls\": 0.06314808130264282, \"value_count\": 2, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 3064.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.004247069358825684, \"value_count\": 1, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 3800.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 900, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 900.0, \"distinct_value_count\": 8809}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": 
\"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"zip_code\\\"\", \"subtitle\": \"In this col, 274 values (0.4%) are null and there are 8809 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 900, \"group_name\": \"_zip_code_\", \"value\": \"91302\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 8809}, {\"value_count\": 839, \"group_name\": \"_zip_code_\", \"value\": \"10019\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 8809}, {\"value_count\": 692, \"group_name\": \"_zip_code_\", \"value\": \"10022\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 8809}, {\"value_count\": 692, \"group_name\": \"_zip_code_\", \"value\": \"00000\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 8809}, {\"value_count\": 687, \"group_name\": \"_zip_code_\", \"value\": \"10036\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 8809}, {\"value_count\": 552, \"group_name\": \"_zip_code_\", \"value\": \"50392\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 8809}, {\"value_count\": 504, \"group_name\": \"_zip_code_\", \"value\": \"21044\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 8809}, {\"value_count\": 454, \"group_name\": \"_zip_code_\", \"value\": \"10010\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 8809}, {\"value_count\": 451, \"group_name\": \"_zip_code_\", \"value\": \"55437\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 8809}, {\"value_count\": 414, \"group_name\": \"_zip_code_\", 
\"value\": \"10281\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 8809}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"56071\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 8809}, {\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"02476\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 8809}, {\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"rg41 \", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 8809}, {\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"19804\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 8809}, {\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"54403\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 8809}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 900]}, \"title\": \"Value count\", \"type\": 
\"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}], \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\"}, {\"mode\": \"vega-lite\"});\n",
        "</script>"
       ],
       "text/plain": [
        "alt.VConcatChart(...)"
       ]
      },
-     "execution_count": 210,
+     "execution_count": 131,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1819,7 +1825,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 211,
+   "execution_count": 132,
    "id": "d9cbc91b-b061-461b-b1f2-83036c70d7c7",
    "metadata": {},
    "outputs": [
@@ -1828,23 +1834,23 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-6022312cfc10441087e40cd1a0af0abb.vega-embed {\n",
+       "  #altair-viz-397c6262cdc84c339cc4c3a1f6a11a12.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-6022312cfc10441087e40cd1a0af0abb.vega-embed details,\n",
-       "  #altair-viz-6022312cfc10441087e40cd1a0af0abb.vega-embed details summary {\n",
+       "  #altair-viz-397c6262cdc84c339cc4c3a1f6a11a12.vega-embed details,\n",
+       "  #altair-viz-397c6262cdc84c339cc4c3a1f6a11a12.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-6022312cfc10441087e40cd1a0af0abb\"></div>\n",
+       "<div id=\"altair-viz-397c6262cdc84c339cc4c3a1f6a11a12\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-6022312cfc10441087e40cd1a0af0abb\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-6022312cfc10441087e40cd1a0af0abb\");\n",
+       "    if (outputDiv.id !== \"altair-viz-397c6262cdc84c339cc4c3a1f6a11a12\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-397c6262cdc84c339cc4c3a1f6a11a12\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -1890,14 +1896,14 @@
        "        .catch(showError)\n",
        "        .then(() => displayChart(vegaEmbed));\n",
        "    }\n",
-       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"vconcat\": [{\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9992628693580627, \"percentile_inc_nulls\": 0.9992628693580627, \"value_count\": 130, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 130.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9986108541488647, \"percentile_inc_nulls\": 0.9986108541488647, \"value_count\": 115, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 115.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9979758262634277, \"percentile_inc_nulls\": 0.9979758262634277, \"value_count\": 112, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.997369110584259, \"percentile_inc_nulls\": 0.997369110584259, \"value_count\": 107, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 107.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9968814849853516, \"percentile_inc_nulls\": 0.9968814849853516, \"value_count\": 86, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 86.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9963995218276978, \"percentile_inc_nulls\": 0.9963995218276978, \"value_count\": 85, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 85.0, \"distinct_value_count\": 17785}, 
{\"percentile_ex_nulls\": 0.9959743022918701, \"percentile_inc_nulls\": 0.9959743022918701, \"value_count\": 75, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 75.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9955546855926514, \"percentile_inc_nulls\": 0.9955546855926514, \"value_count\": 74, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 74.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9951464533805847, \"percentile_inc_nulls\": 0.9951464533805847, \"value_count\": 72, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 72.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.994755208492279, \"percentile_inc_nulls\": 0.994755208492279, \"value_count\": 69, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 69.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9943753480911255, \"percentile_inc_nulls\": 0.9943753480911255, \"value_count\": 67, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 67.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9940010905265808, \"percentile_inc_nulls\": 0.9940010905265808, \"value_count\": 66, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 66.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9932640194892883, \"percentile_inc_nulls\": 0.9932640194892883, \"value_count\": 65, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, 
\"sum_tokens_in_value_count_group\": 130.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9929068088531494, \"percentile_inc_nulls\": 0.9929068088531494, \"value_count\": 63, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 63.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9922150373458862, \"percentile_inc_nulls\": 0.9922150373458862, \"value_count\": 61, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 122.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9918975234031677, \"percentile_inc_nulls\": 0.9918975234031677, \"value_count\": 56, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9909619688987732, \"percentile_inc_nulls\": 0.9909619688987732, \"value_count\": 55, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 165.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.990655779838562, \"percentile_inc_nulls\": 0.990655779838562, \"value_count\": 54, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 54.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9900547862052917, \"percentile_inc_nulls\": 0.9900547862052917, \"value_count\": 53, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 106.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9897599220275879, \"percentile_inc_nulls\": 0.9897599220275879, \"value_count\": 52, \"group_name\": \"_company_name_\", 
\"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 52.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9894764423370361, \"percentile_inc_nulls\": 0.9894764423370361, \"value_count\": 50, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 50.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.989198625087738, \"percentile_inc_nulls\": 0.989198625087738, \"value_count\": 49, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 49.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9856604933738708, \"percentile_inc_nulls\": 0.9856604933738708, \"value_count\": 48, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 624.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9837950468063354, \"percentile_inc_nulls\": 0.9837950468063354, \"value_count\": 47, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 329.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9827517867088318, \"percentile_inc_nulls\": 0.9827517867088318, \"value_count\": 46, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 184.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9817311763763428, \"percentile_inc_nulls\": 0.9817311763763428, \"value_count\": 45, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 180.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.981481671333313, \"percentile_inc_nulls\": 0.981481671333313, 
\"value_count\": 44, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 44.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9809940457344055, \"percentile_inc_nulls\": 0.9809940457344055, \"value_count\": 43, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 86.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9805178046226501, \"percentile_inc_nulls\": 0.9805178046226501, \"value_count\": 42, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 84.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9795879125595093, \"percentile_inc_nulls\": 0.9795879125595093, \"value_count\": 41, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 164.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9791343212127686, \"percentile_inc_nulls\": 0.9791343212127686, \"value_count\": 40, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 80.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9786920547485352, \"percentile_inc_nulls\": 0.9786920547485352, \"value_count\": 39, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9761065244674683, \"percentile_inc_nulls\": 0.9761065244674683, \"value_count\": 38, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 456.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 
0.9750575423240662, \"percentile_inc_nulls\": 0.9750575423240662, \"value_count\": 37, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 185.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9746493101119995, \"percentile_inc_nulls\": 0.9746493101119995, \"value_count\": 36, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 72.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9728632569313049, \"percentile_inc_nulls\": 0.9728632569313049, \"value_count\": 35, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 315.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9717065691947937, \"percentile_inc_nulls\": 0.9717065691947937, \"value_count\": 34, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 204.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9700225591659546, \"percentile_inc_nulls\": 0.9700225591659546, \"value_count\": 33, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 297.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9682081341743469, \"percentile_inc_nulls\": 0.9682081341743469, \"value_count\": 32, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 320.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9597711563110352, \"percentile_inc_nulls\": 0.9597711563110352, \"value_count\": 31, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1488.0, 
\"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9567093253135681, \"percentile_inc_nulls\": 0.9567093253135681, \"value_count\": 30, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 540.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9550650119781494, \"percentile_inc_nulls\": 0.9550650119781494, \"value_count\": 29, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 290.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9520485997200012, \"percentile_inc_nulls\": 0.9520485997200012, \"value_count\": 28, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 532.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9500584006309509, \"percentile_inc_nulls\": 0.9500584006309509, \"value_count\": 27, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 351.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.948879063129425, \"percentile_inc_nulls\": 0.948879063129425, \"value_count\": 26, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 208.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.9470362663269043, \"percentile_inc_nulls\": 0.9470362663269043, \"value_count\": 25, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 325.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.7761189937591553, \"percentile_inc_nulls\": 0.7761189937591553, \"value_count\": 24, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, 
\"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 30144.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.7446900606155396, \"percentile_inc_nulls\": 0.7446900606155396, \"value_count\": 23, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 5543.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.692049503326416, \"percentile_inc_nulls\": 0.692049503326416, \"value_count\": 22, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 9284.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.6740698218345642, \"percentile_inc_nulls\": 0.6740698218345642, \"value_count\": 21, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 3171.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.6567195653915405, \"percentile_inc_nulls\": 0.6567195653915405, \"value_count\": 20, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 3060.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.6309719681739807, \"percentile_inc_nulls\": 0.6309719681739807, \"value_count\": 19, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 4541.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.6099475026130676, \"percentile_inc_nulls\": 0.6099475026130676, \"value_count\": 18, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 3708.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.5923080444335938, \"percentile_inc_nulls\": 0.5923080444335938, \"value_count\": 17, 
\"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 3111.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.5730752944946289, \"percentile_inc_nulls\": 0.5730752944946289, \"value_count\": 16, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 3392.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.5208543539047241, \"percentile_inc_nulls\": 0.5208543539047241, \"value_count\": 15, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 9210.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.4969608783721924, \"percentile_inc_nulls\": 0.4969608783721924, \"value_count\": 14, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 4214.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.4662973880767822, \"percentile_inc_nulls\": 0.4662973880767822, \"value_count\": 13, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 5408.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.41703617572784424, \"percentile_inc_nulls\": 0.41703617572784424, \"value_count\": 12, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 8688.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.37624597549438477, \"percentile_inc_nulls\": 0.37624597549438477, \"value_count\": 11, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 7194.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 
0.3343445062637329, \"percentile_inc_nulls\": 0.3343445062637329, \"value_count\": 10, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 7390.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.2747921943664551, \"percentile_inc_nulls\": 0.2747921943664551, \"value_count\": 9, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 10503.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.23614531755447388, \"percentile_inc_nulls\": 0.23614531755447388, \"value_count\": 8, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 6816.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.1329507827758789, \"percentile_inc_nulls\": 0.1329507827758789, \"value_count\": 7, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 18200.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.10240066051483154, \"percentile_inc_nulls\": 0.10240066051483154, \"value_count\": 6, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 5388.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.060612618923187256, \"percentile_inc_nulls\": 0.060612618923187256, \"value_count\": 5, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 7370.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.03430366516113281, \"percentile_inc_nulls\": 0.03430366516113281, \"value_count\": 4, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, 
\"sum_tokens_in_value_count_group\": 4640.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.015694618225097656, \"percentile_inc_nulls\": 0.015694618225097656, \"value_count\": 3, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 3282.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.004479348659515381, \"percentile_inc_nulls\": 0.004479348659515381, \"value_count\": 2, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1978.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 1, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 790.0, \"distinct_value_count\": 17785}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 130, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 130.0, \"distinct_value_count\": 17785}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"company_name\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 17785 distinct values\"}}, {\"mark\": \"bar\", \"data\": 
{\"values\": [{\"value_count\": 130, \"group_name\": \"_company_name_\", \"value\": \"georgia pacific corporation\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 17785}, {\"value_count\": 115, \"group_name\": \"_company_name_\", \"value\": \"weyerhaeuser company\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 17785}, {\"value_count\": 112, \"group_name\": \"_company_name_\", \"value\": \"international paper company\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 17785}, {\"value_count\": 107, \"group_name\": \"_company_name_\", \"value\": \"calpine corporation\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 17785}, {\"value_count\": 86, \"group_name\": \"_company_name_\", \"value\": \"springfield city of\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 17785}, {\"value_count\": 85, \"group_name\": \"_company_name_\", \"value\": \"calpine eastern corporation\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 17785}, {\"value_count\": 75, \"group_name\": \"_company_name_\", \"value\": \"tri county electric coop, incorporated\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 17785}, {\"value_count\": 74, \"group_name\": \"_company_name_\", \"value\": \"stone container corporation\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 17785}, {\"value_count\": 72, \"group_name\": \"_company_name_\", \"value\": \"marshall city of\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 17785}, {\"value_count\": 69, \"group_name\": \"_company_name_\", \"value\": \"burlington city of\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, 
\"distinct_value_count\": 17785}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"reliability design and development limited liability company\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 17785}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"anole energy storage, limited liability company\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 17785}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"northumberland solar i, limited liability company\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 17785}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"ny cdg genesee 4 limited liability company\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 17785}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"eni new energy us, incorporated\", \"total_non_null_rows\": 176366, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 17785}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", 
\"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 130]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9043566584587097, \"percentile_inc_nulls\": 0.9151197075843811, \"value_count\": 14970, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 14970.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.821810781955719, \"percentile_inc_nulls\": 0.8418629169464111, \"value_count\": 12920, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 12920.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.7686542868614197, \"percentile_inc_nulls\": 0.79468834400177, \"value_count\": 8320, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 8320.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.7240015268325806, \"percentile_inc_nulls\": 0.7550604939460754, \"value_count\": 6989, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 6989.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.6845366954803467, \"percentile_inc_nulls\": 0.7200367450714111, \"value_count\": 6177, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 6177.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.6467840671539307, \"percentile_inc_nulls\": 0.6865325570106506, \"value_count\": 5909, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, 
\"sum_tokens_in_value_count_group\": 5909.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.6090570688247681, \"percentile_inc_nulls\": 0.6530510187149048, \"value_count\": 5905, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 5905.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.5756553411483765, \"percentile_inc_nulls\": 0.6234081387519836, \"value_count\": 5228, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 5228.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.5448348522186279, \"percentile_inc_nulls\": 0.5960559248924255, \"value_count\": 4824, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 4824.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.5146595239639282, \"percentile_inc_nulls\": 0.5692763924598694, \"value_count\": 4723, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 4723.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.4875510334968567, \"percentile_inc_nulls\": 0.5452184677124023, \"value_count\": 4243, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 4243.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.46147751808166504, \"percentile_inc_nulls\": 0.5220791101455688, \"value_count\": 4081, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 4081.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.4364837408065796, \"percentile_inc_nulls\": 0.49989795684814453, \"value_count\": 3912, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, 
\"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 3912.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.4123716354370117, \"percentile_inc_nulls\": 0.47849923372268677, \"value_count\": 3774, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 3774.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.3896331787109375, \"percentile_inc_nulls\": 0.45831960439682007, \"value_count\": 3559, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 3559.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.3688114285469055, \"percentile_inc_nulls\": 0.43984103202819824, \"value_count\": 3259, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 3259.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.34828996658325195, \"percentile_inc_nulls\": 0.4216288924217224, \"value_count\": 3212, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 3212.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.328139066696167, \"percentile_inc_nulls\": 0.4037455916404724, \"value_count\": 3154, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 3154.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.3089912533760071, \"percentile_inc_nulls\": 0.38675254583358765, \"value_count\": 2997, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2997.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.2900094985961914, \"percentile_inc_nulls\": 0.3699069023132324, \"value_count\": 2971, \"group_name\": \"_state_\", 
\"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2971.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.2711939215660095, \"percentile_inc_nulls\": 0.35320866107940674, \"value_count\": 2945, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2945.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.25474226474761963, \"percentile_inc_nulls\": 0.3386083245277405, \"value_count\": 2575, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2575.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.2392297387123108, \"percentile_inc_nulls\": 0.3248414993286133, \"value_count\": 2428, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2428.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.22381949424743652, \"percentile_inc_nulls\": 0.31116539239883423, \"value_count\": 2412, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2412.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.20867115259170532, \"percentile_inc_nulls\": 0.297721803188324, \"value_count\": 2371, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2371.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.19549065828323364, \"percentile_inc_nulls\": 0.2860245108604431, \"value_count\": 2063, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2063.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.18243151903152466, \"percentile_inc_nulls\": 0.27443498373031616, \"value_count\": 2044, 
\"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2044.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.16952574253082275, \"percentile_inc_nulls\": 0.2629815340042114, \"value_count\": 2020, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2020.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.15754634141921997, \"percentile_inc_nulls\": 0.25235021114349365, \"value_count\": 1875, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1875.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.14563089609146118, \"percentile_inc_nulls\": 0.24177563190460205, \"value_count\": 1865, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1865.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.13393902778625488, \"percentile_inc_nulls\": 0.23139947652816772, \"value_count\": 1830, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1830.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.12323743104934692, \"percentile_inc_nulls\": 0.22190219163894653, \"value_count\": 1675, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1675.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.11320030689239502, \"percentile_inc_nulls\": 0.21299457550048828, \"value_count\": 1571, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1571.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.10325264930725098, \"percentile_inc_nulls\": 0.20416635274887085, 
\"value_count\": 1557, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1557.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.09345829486846924, \"percentile_inc_nulls\": 0.19547420740127563, \"value_count\": 1533, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1533.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.08405369520187378, \"percentile_inc_nulls\": 0.18712788820266724, \"value_count\": 1472, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1472.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.07524967193603516, \"percentile_inc_nulls\": 0.17931461334228516, \"value_count\": 1378, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1378.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.06659895181655884, \"percentile_inc_nulls\": 0.1716374158859253, \"value_count\": 1354, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1354.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.059660494327545166, \"percentile_inc_nulls\": 0.16547971963882446, \"value_count\": 1086, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1086.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.05314368009567261, \"percentile_inc_nulls\": 0.15969634056091309, \"value_count\": 1020, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1020.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.046812236309051514, 
\"percentile_inc_nulls\": 0.15407729148864746, \"value_count\": 991, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 991.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.040736258029937744, \"percentile_inc_nulls\": 0.14868509769439697, \"value_count\": 951, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 951.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.035171449184417725, \"percentile_inc_nulls\": 0.14374655485153198, \"value_count\": 871, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 871.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.0299643874168396, \"percentile_inc_nulls\": 0.13912546634674072, \"value_count\": 815, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 815.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.025000154972076416, \"percentile_inc_nulls\": 0.1347198486328125, \"value_count\": 777, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 777.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.02038729190826416, \"percentile_inc_nulls\": 0.13062608242034912, \"value_count\": 722, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 722.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.016336679458618164, \"percentile_inc_nulls\": 0.12703126668930054, \"value_count\": 634, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 634.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 
0.01256716251373291, \"percentile_inc_nulls\": 0.12368595600128174, \"value_count\": 590, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 590.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.009583473205566406, \"percentile_inc_nulls\": 0.12103807926177979, \"value_count\": 467, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 467.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.00688093900680542, \"percentile_inc_nulls\": 0.11863964796066284, \"value_count\": 423, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 423.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.004612863063812256, \"percentile_inc_nulls\": 0.1166267991065979, \"value_count\": 355, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 355.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.002830326557159424, \"percentile_inc_nulls\": 0.11504483222961426, \"value_count\": 279, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 279.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.0016611218452453613, \"percentile_inc_nulls\": 0.11400723457336426, \"value_count\": 183, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 183.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.0009072422981262207, \"percentile_inc_nulls\": 0.1133381724357605, \"value_count\": 118, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 118.0, \"distinct_value_count\": 63}, 
{\"percentile_ex_nulls\": 0.0005686283111572266, \"percentile_inc_nulls\": 0.11303764581680298, \"value_count\": 53, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 53.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.0004088878631591797, \"percentile_inc_nulls\": 0.1128959059715271, \"value_count\": 25, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 25.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.0002875328063964844, \"percentile_inc_nulls\": 0.11278820037841797, \"value_count\": 19, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 19.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 7.665157318115234e-05, \"percentile_inc_nulls\": 0.11260104179382324, \"value_count\": 11, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 3.1948089599609375e-05, \"percentile_inc_nulls\": 0.11256140470504761, \"value_count\": 7, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 7.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 6.377696990966797e-06, \"percentile_inc_nulls\": 0.11253869533538818, \"value_count\": 4, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 4.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.11253303289413452, \"value_count\": 1, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1.0, \"distinct_value_count\": 63}, 
{\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 14970, \"group_name\": \"_state_\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 14970.0, \"distinct_value_count\": 63}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"state\\\"\", \"subtitle\": \"In this col, 19,847 values (11.3%) are null and there are 63 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 14970, \"group_name\": \"_state_\", \"value\": \"ca\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 63}, {\"value_count\": 12920, \"group_name\": \"_state_\", \"value\": \"tx\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 63}, {\"value_count\": 8320, \"group_name\": \"_state_\", \"value\": \"ny\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 63}, {\"value_count\": 6989, \"group_name\": \"_state_\", \"value\": \"nc\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 63}, {\"value_count\": 6177, \"group_name\": \"_state_\", \"value\": \"ma\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 63}, {\"value_count\": 5909, \"group_name\": \"_state_\", \"value\": \"fl\", 
\"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 63}, {\"value_count\": 5905, \"group_name\": \"_state_\", \"value\": \"mn\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 63}, {\"value_count\": 5228, \"group_name\": \"_state_\", \"value\": \"pa\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 63}, {\"value_count\": 4824, \"group_name\": \"_state_\", \"value\": \"il\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 63}, {\"value_count\": 4723, \"group_name\": \"_state_\", \"value\": \"ia\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 63}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"mp\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 63}, {\"value_count\": 4, \"group_name\": \"_state_\", \"value\": \"uk\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 63}, {\"value_count\": 7, \"group_name\": \"_state_\", \"value\": \"8a\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 63}, {\"value_count\": 11, \"group_name\": \"_state_\", \"value\": \"pr\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 63}, 
{\"value_count\": 11, \"group_name\": \"_state_\", \"value\": \"gu\", \"total_non_null_rows\": 156519, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 63}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 14970]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9656325578689575, \"percentile_inc_nulls\": 0.9748250842094421, \"value_count\": 4440, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 4440.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.9436729550361633, \"percentile_inc_nulls\": 0.9587392210960388, \"value_count\": 2837, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2837.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.9245464205741882, \"percentile_inc_nulls\": 0.9447285532951355, \"value_count\": 2471, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2471.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.908012866973877, \"percentile_inc_nulls\": 0.9326174259185791, \"value_count\": 2136, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2136.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 
0.8943200707435608, \"percentile_inc_nulls\": 0.9225870966911316, \"value_count\": 1769, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1769.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.881432294845581, \"percentile_inc_nulls\": 0.9131464958190918, \"value_count\": 1665, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1665.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.8725308179855347, \"percentile_inc_nulls\": 0.9066259860992432, \"value_count\": 1150, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1150.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.8643027544021606, \"percentile_inc_nulls\": 0.9005987644195557, \"value_count\": 1063, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1063.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.8566164970397949, \"percentile_inc_nulls\": 0.8949683904647827, \"value_count\": 993, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 993.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.8491934537887573, \"percentile_inc_nulls\": 0.8895308375358582, \"value_count\": 959, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 959.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.841948390007019, \"percentile_inc_nulls\": 0.8842236995697021, \"value_count\": 936, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 936.0, \"distinct_value_count\": 4230}, 
{\"percentile_ex_nulls\": 0.8355780839920044, \"percentile_inc_nulls\": 0.8795572519302368, \"value_count\": 823, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 823.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.8303300142288208, \"percentile_inc_nulls\": 0.8757129907608032, \"value_count\": 678, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 678.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.8251284956932068, \"percentile_inc_nulls\": 0.8719027638435364, \"value_count\": 672, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 672.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.8199656009674072, \"percentile_inc_nulls\": 0.8681208491325378, \"value_count\": 667, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 667.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.814818263053894, \"percentile_inc_nulls\": 0.8643502593040466, \"value_count\": 665, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 665.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.8097947239875793, \"percentile_inc_nulls\": 0.8606704473495483, \"value_count\": 649, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 649.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.8050498366355896, \"percentile_inc_nulls\": 0.8571946620941162, \"value_count\": 613, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 613.0, \"distinct_value_count\": 
4230}, {\"percentile_ex_nulls\": 0.8005449175834656, \"percentile_inc_nulls\": 0.8538947105407715, \"value_count\": 582, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 582.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7961406111717224, \"percentile_inc_nulls\": 0.8506684899330139, \"value_count\": 569, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 569.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7919143438339233, \"percentile_inc_nulls\": 0.8475726842880249, \"value_count\": 546, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 546.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7879435420036316, \"percentile_inc_nulls\": 0.8446639180183411, \"value_count\": 513, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 513.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7840656042098999, \"percentile_inc_nulls\": 0.8418232202529907, \"value_count\": 501, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 501.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7802186012268066, \"percentile_inc_nulls\": 0.8390052318572998, \"value_count\": 497, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 497.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7763948440551758, \"percentile_inc_nulls\": 0.8362042903900146, \"value_count\": 494, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 494.0, 
\"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7726020216941833, \"percentile_inc_nulls\": 0.8334259390830994, \"value_count\": 490, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 490.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7688246965408325, \"percentile_inc_nulls\": 0.8306589722633362, \"value_count\": 488, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 488.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7613784074783325, \"percentile_inc_nulls\": 0.8252043724060059, \"value_count\": 481, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 962.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7577094435691833, \"percentile_inc_nulls\": 0.8225167989730835, \"value_count\": 474, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 474.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7504025101661682, \"percentile_inc_nulls\": 0.8171643018722534, \"value_count\": 472, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 944.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7467954754829407, \"percentile_inc_nulls\": 0.8145220875740051, \"value_count\": 466, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 466.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7432968020439148, \"percentile_inc_nulls\": 0.8119592070579529, \"value_count\": 452, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 
452.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7398368120193481, \"percentile_inc_nulls\": 0.8094246983528137, \"value_count\": 447, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 447.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7364155650138855, \"percentile_inc_nulls\": 0.8069185614585876, \"value_count\": 442, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 442.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7331104278564453, \"percentile_inc_nulls\": 0.804497480392456, \"value_count\": 427, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 427.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7299058437347412, \"percentile_inc_nulls\": 0.8021500706672668, \"value_count\": 414, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 414.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7267787456512451, \"percentile_inc_nulls\": 0.7998594045639038, \"value_count\": 404, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 404.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7236748337745667, \"percentile_inc_nulls\": 0.7975857257843018, \"value_count\": 401, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 401.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7206483483314514, \"percentile_inc_nulls\": 0.7953687310218811, \"value_count\": 391, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, 
\"sum_tokens_in_value_count_group\": 391.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7176682949066162, \"percentile_inc_nulls\": 0.7931857705116272, \"value_count\": 385, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 385.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7148430347442627, \"percentile_inc_nulls\": 0.7911162376403809, \"value_count\": 365, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 365.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7121338844299316, \"percentile_inc_nulls\": 0.7891317009925842, \"value_count\": 350, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 350.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7094789147377014, \"percentile_inc_nulls\": 0.787186861038208, \"value_count\": 343, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 343.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.706924557685852, \"percentile_inc_nulls\": 0.785315752029419, \"value_count\": 330, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 330.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7043780088424683, \"percentile_inc_nulls\": 0.7834503054618835, \"value_count\": 329, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 329.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.7020094394683838, \"percentile_inc_nulls\": 0.7817152738571167, \"value_count\": 306, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, 
\"sum_tokens_in_value_count_group\": 306.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6997259855270386, \"percentile_inc_nulls\": 0.7800426483154297, \"value_count\": 295, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 295.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6974812746047974, \"percentile_inc_nulls\": 0.778398334980011, \"value_count\": 290, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 290.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6953061819076538, \"percentile_inc_nulls\": 0.77680504322052, \"value_count\": 281, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 281.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6931775808334351, \"percentile_inc_nulls\": 0.7752457857131958, \"value_count\": 275, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 275.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.688951313495636, \"percentile_inc_nulls\": 0.7721499800682068, \"value_count\": 273, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 546.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6847715377807617, \"percentile_inc_nulls\": 0.7690881490707397, \"value_count\": 270, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 540.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6827589869499207, \"percentile_inc_nulls\": 0.76761394739151, \"value_count\": 260, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, 
\"sum_tokens_in_value_count_group\": 260.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6807929277420044, \"percentile_inc_nulls\": 0.7661737203598022, \"value_count\": 254, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 254.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6788965463638306, \"percentile_inc_nulls\": 0.764784574508667, \"value_count\": 245, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 245.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6770620346069336, \"percentile_inc_nulls\": 0.7634407877922058, \"value_count\": 237, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 237.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.675250768661499, \"percentile_inc_nulls\": 0.7621140480041504, \"value_count\": 234, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.673454999923706, \"percentile_inc_nulls\": 0.7607985734939575, \"value_count\": 232, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 232.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6717057228088379, \"percentile_inc_nulls\": 0.7595171332359314, \"value_count\": 226, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 226.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.670002818107605, \"percentile_inc_nulls\": 0.758269727230072, \"value_count\": 220, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, 
\"sum_tokens_in_value_count_group\": 220.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6683308482170105, \"percentile_inc_nulls\": 0.7570450305938721, \"value_count\": 216, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 216.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6666666269302368, \"percentile_inc_nulls\": 0.755825936794281, \"value_count\": 215, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 215.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6650102138519287, \"percentile_inc_nulls\": 0.7546125650405884, \"value_count\": 214, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 214.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6584153771400452, \"percentile_inc_nulls\": 0.7497817277908325, \"value_count\": 213, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 852.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6567744016647339, \"percentile_inc_nulls\": 0.7485796213150024, \"value_count\": 212, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 212.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6551489233970642, \"percentile_inc_nulls\": 0.7473889589309692, \"value_count\": 210, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 210.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6535311937332153, \"percentile_inc_nulls\": 0.7462038993835449, \"value_count\": 209, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 
176366, \"sum_tokens_in_value_count_group\": 209.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6519289016723633, \"percentile_inc_nulls\": 0.7450302243232727, \"value_count\": 207, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 207.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.650334358215332, \"percentile_inc_nulls\": 0.7438621520996094, \"value_count\": 206, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 206.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6487475633621216, \"percentile_inc_nulls\": 0.7426998615264893, \"value_count\": 205, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 205.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6471762657165527, \"percentile_inc_nulls\": 0.7415488362312317, \"value_count\": 203, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 203.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6456127166748047, \"percentile_inc_nulls\": 0.7404034733772278, \"value_count\": 202, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 202.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6440646648406982, \"percentile_inc_nulls\": 0.739269495010376, \"value_count\": 200, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6425474882125854, \"percentile_inc_nulls\": 0.738158106803894, \"value_count\": 196, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 
176366, \"sum_tokens_in_value_count_group\": 196.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.641038179397583, \"percentile_inc_nulls\": 0.7370525002479553, \"value_count\": 195, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 195.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6396061778068542, \"percentile_inc_nulls\": 0.7360035181045532, \"value_count\": 185, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 185.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6381819248199463, \"percentile_inc_nulls\": 0.7349602580070496, \"value_count\": 184, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 184.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6353799104690552, \"percentile_inc_nulls\": 0.7329077124595642, \"value_count\": 181, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 362.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6339865922927856, \"percentile_inc_nulls\": 0.7318871021270752, \"value_count\": 180, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 180.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6326165795326233, \"percentile_inc_nulls\": 0.7308834791183472, \"value_count\": 177, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 177.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6286225318908691, \"percentile_inc_nulls\": 0.7279577255249023, \"value_count\": 172, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, 
\"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 516.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.627298891544342, \"percentile_inc_nulls\": 0.7269881963729858, \"value_count\": 171, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 171.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6260062456130981, \"percentile_inc_nulls\": 0.726041316986084, \"value_count\": 167, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 167.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.624721348285675, \"percentile_inc_nulls\": 0.725100040435791, \"value_count\": 166, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 166.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6222134828567505, \"percentile_inc_nulls\": 0.7232630252838135, \"value_count\": 162, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 324.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6209672689437866, \"percentile_inc_nulls\": 0.7223501205444336, \"value_count\": 161, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 161.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6197519898414612, \"percentile_inc_nulls\": 0.7214599251747131, \"value_count\": 157, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 157.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6185522079467773, \"percentile_inc_nulls\": 0.7205810546875, \"value_count\": 155, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, 
\"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 155.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6173679828643799, \"percentile_inc_nulls\": 0.719713568687439, \"value_count\": 153, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 153.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6150148510932922, \"percentile_inc_nulls\": 0.7179898619651794, \"value_count\": 152, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 304.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6138460636138916, \"percentile_inc_nulls\": 0.7171337008476257, \"value_count\": 151, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 151.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6103628873825073, \"percentile_inc_nulls\": 0.7145822048187256, \"value_count\": 150, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 450.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6080716848373413, \"percentile_inc_nulls\": 0.7129038572311401, \"value_count\": 148, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 296.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6069415807723999, \"percentile_inc_nulls\": 0.7120760679244995, \"value_count\": 146, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 146.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6047123670578003, \"percentile_inc_nulls\": 0.7104430198669434, \"value_count\": 144, \"group_name\": \"_city_\", \"total_non_null_rows\": 
129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 288.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6025450229644775, \"percentile_inc_nulls\": 0.7088554501533508, \"value_count\": 140, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 280.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.6014846563339233, \"percentile_inc_nulls\": 0.7080786228179932, \"value_count\": 137, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 137.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5993791818618774, \"percentile_inc_nulls\": 0.7065364122390747, \"value_count\": 136, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 272.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5972893238067627, \"percentile_inc_nulls\": 0.7050055265426636, \"value_count\": 135, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 270.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5962598323822021, \"percentile_inc_nulls\": 0.7042514085769653, \"value_count\": 133, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 133.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5942318439483643, \"percentile_inc_nulls\": 0.7027658224105835, \"value_count\": 131, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 262.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5912131071090698, \"percentile_inc_nulls\": 0.7005544900894165, \"value_count\": 130, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 390.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5902145504951477, \"percentile_inc_nulls\": 0.6998230814933777, \"value_count\": 129, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 129.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5892238020896912, \"percentile_inc_nulls\": 0.6990973353385925, \"value_count\": 128, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 128.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5872577428817749, \"percentile_inc_nulls\": 0.6976571083068848, \"value_count\": 127, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 254.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5833565592765808, \"percentile_inc_nulls\": 0.6947994232177734, \"value_count\": 126, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 504.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5823889970779419, \"percentile_inc_nulls\": 0.694090723991394, \"value_count\": 125, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 125.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5804848670959473, \"percentile_inc_nulls\": 0.6926958560943604, \"value_count\": 123, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 246.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.577651858329773, \"percentile_inc_nulls\": 0.6906206607818604, \"value_count\": 122, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 366.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.574842095375061, \"percentile_inc_nulls\": 0.6885623931884766, \"value_count\": 121, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 363.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5739209651947021, \"percentile_inc_nulls\": 0.6878876686096191, \"value_count\": 119, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 119.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5730153322219849, \"percentile_inc_nulls\": 0.687224268913269, \"value_count\": 117, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 117.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5712195634841919, \"percentile_inc_nulls\": 0.685908854007721, \"value_count\": 116, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 232.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5694392919540405, \"percentile_inc_nulls\": 0.6846047639846802, \"value_count\": 115, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 230.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5685569047927856, \"percentile_inc_nulls\": 0.6839583516120911, \"value_count\": 114, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5659328699111938, \"percentile_inc_nulls\": 0.6820362210273743, \"value_count\": 113, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 339.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5641990303993225, \"percentile_inc_nulls\": 0.6807661056518555, \"value_count\": 112, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 224.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5624806880950928, \"percentile_inc_nulls\": 0.6795073747634888, \"value_count\": 111, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 222.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5616291761398315, \"percentile_inc_nulls\": 0.6788836717605591, \"value_count\": 110, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5599417686462402, \"percentile_inc_nulls\": 0.677647590637207, \"value_count\": 109, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 218.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5574339032173157, \"percentile_inc_nulls\": 0.6758105158805847, \"value_count\": 108, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 324.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5541210174560547, \"percentile_inc_nulls\": 0.6733837723731995, \"value_count\": 107, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 428.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5516595840454102, \"percentile_inc_nulls\": 0.6715806722640991, \"value_count\": 106, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 318.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5508468151092529, \"percentile_inc_nulls\": 0.6709853410720825, \"value_count\": 105, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 105.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.547626793384552, \"percentile_inc_nulls\": 0.668626606464386, \"value_count\": 104, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 416.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5452350378036499, \"percentile_inc_nulls\": 0.6668745279312134, \"value_count\": 103, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 309.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5436559915542603, \"percentile_inc_nulls\": 0.6657178401947021, \"value_count\": 102, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 204.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5420923829078674, \"percentile_inc_nulls\": 0.664572536945343, \"value_count\": 101, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 202.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5389962196350098, \"percentile_inc_nulls\": 0.6623045206069946, \"value_count\": 100, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 400.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5351647138595581, \"percentile_inc_nulls\": 0.659497857093811, \"value_count\": 99, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 495.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5328890085220337, \"percentile_inc_nulls\": 0.6578308343887329, \"value_count\": 98, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 294.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5313873887062073, \"percentile_inc_nulls\": 0.6567308902740479, \"value_count\": 97, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 194.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5291581153869629, \"percentile_inc_nulls\": 0.6550979614257812, \"value_count\": 96, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 288.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5284228324890137, \"percentile_inc_nulls\": 0.6545592546463013, \"value_count\": 95, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 95.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5247848033905029, \"percentile_inc_nulls\": 0.6518943309783936, \"value_count\": 94, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 470.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5211855173110962, \"percentile_inc_nulls\": 0.6492577791213989, \"value_count\": 93, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 465.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5204733610153198, \"percentile_inc_nulls\": 0.6487361192703247, \"value_count\": 92, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 92.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5176868438720703, \"percentile_inc_nulls\": 0.6466948986053467, \"value_count\": 90, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 360.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5163090229034424, \"percentile_inc_nulls\": 0.6456856727600098, \"value_count\": 89, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 178.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5129032731056213, \"percentile_inc_nulls\": 0.643190860748291, \"value_count\": 88, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 440.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5095362067222595, \"percentile_inc_nulls\": 0.6407244205474854, \"value_count\": 87, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 435.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5068734884262085, \"percentile_inc_nulls\": 0.6387739181518555, \"value_count\": 86, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 344.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5042417049407959, \"percentile_inc_nulls\": 0.6368460655212402, \"value_count\": 85, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 340.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5035915374755859, \"percentile_inc_nulls\": 0.6363698244094849, \"value_count\": 84, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 84.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.5023066401481628, \"percentile_inc_nulls\": 0.6354286074638367, \"value_count\": 83, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 166.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.4984983801841736, \"percentile_inc_nulls\": 0.6326389312744141, \"value_count\": 82, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 492.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.4947364926338196, \"percentile_inc_nulls\": 0.6298832893371582, \"value_count\": 81, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 486.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.49225956201553345, \"percentile_inc_nulls\": 0.6280689239501953, \"value_count\": 80, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 320.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.49042510986328125, \"percentile_inc_nulls\": 0.6267250776290894, \"value_count\": 79, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 237.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.4886138439178467, \"percentile_inc_nulls\": 0.6253982782363892, \"value_count\": 78, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.4880255460739136, \"percentile_inc_nulls\": 0.6249673962593079, \"value_count\": 76, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 76.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.48396188020706177, \"percentile_inc_nulls\": 0.6219906210899353, \"value_count\": 75, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 525.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.48109787702560425, \"percentile_inc_nulls\": 0.6198927164077759, \"value_count\": 74, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 370.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.47940272092819214, \"percentile_inc_nulls\": 0.6186509728431702, \"value_count\": 73, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 219.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.47773081064224243, \"percentile_inc_nulls\": 0.6174262762069702, \"value_count\": 72, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 216.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.47608208656311035, \"percentile_inc_nulls\": 0.6162185668945312, \"value_count\": 71, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 213.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.4722893238067627, \"percentile_inc_nulls\": 0.6134402751922607, \"value_count\": 70, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 490.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.47015297412872314, \"percentile_inc_nulls\": 0.6118752956390381, \"value_count\": 69, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 276.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.46752119064331055, \"percentile_inc_nulls\": 0.6099475026130676, \"value_count\": 68, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 340.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.4659653902053833, \"percentile_inc_nulls\": 0.6088078022003174, \"value_count\": 67, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 201.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.46494364738464355, \"percentile_inc_nulls\": 0.6080594062805176, \"value_count\": 66, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 132.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.46293115615844727, \"percentile_inc_nulls\": 0.6065851449966431, \"value_count\": 65, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 260.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.46194034814834595, \"percentile_inc_nulls\": 0.6058593988418579, \"value_count\": 64, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 128.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.46047741174697876, \"percentile_inc_nulls\": 0.6047877669334412, \"value_count\": 63, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 189.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.4566381573677063, \"percentile_inc_nulls\": 0.6019754409790039, \"value_count\": 62, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 496.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.45286083221435547, \"percentile_inc_nulls\": 0.5992084741592407, \"value_count\": 61, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 488.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.4496098756790161, \"percentile_inc_nulls\": 0.5968270301818848, \"value_count\": 60, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 420.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.4477831721305847, \"percentile_inc_nulls\": 0.5954889059066772, \"value_count\": 59, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 236.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.44374263286590576, \"percentile_inc_nulls\": 0.5925291776657104, \"value_count\": 58, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 522.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.439771831035614, \"percentile_inc_nulls\": 0.5896204710006714, \"value_count\": 57, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 513.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.43760448694229126, \"percentile_inc_nulls\": 0.5880328416824341, \"value_count\": 56, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 280.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.43292152881622314, \"percentile_inc_nulls\": 0.5846024751663208, \"value_count\": 55, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 605.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.4308316111564636, \"percentile_inc_nulls\": 0.5830715894699097, \"value_count\": 54, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 270.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.4263189435005188, \"percentile_inc_nulls\": 0.5797659158706665, \"value_count\": 53, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 583.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.4222939610481262, \"percentile_inc_nulls\": 0.576817512512207, \"value_count\": 52, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 520.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.41716206073760986, \"percentile_inc_nulls\": 0.5730583071708679, \"value_count\": 51, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 663.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.4105827212333679, \"percentile_inc_nulls\": 0.5682387351989746, \"value_count\": 50, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 850.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.4083070158958435, \"percentile_inc_nulls\": 0.566571831703186, \"value_count\": 49, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 294.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.4038485288619995, \"percentile_inc_nulls\": 0.5633058547973633, \"value_count\": 48, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 576.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.40202951431274414, \"percentile_inc_nulls\": 0.5619733929634094, \"value_count\": 47, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 235.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.3913477659225464, \"percentile_inc_nulls\": 0.55414879322052, \"value_count\": 46, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1380.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.3882129192352295, \"percentile_inc_nulls\": 0.5518524050712585, \"value_count\": 45, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 405.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.38174188137054443, \"percentile_inc_nulls\": 0.547112226486206, \"value_count\": 44, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 836.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.3760836720466614, \"percentile_inc_nulls\": 0.5429674386978149, \"value_count\": 43, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 731.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.37250757217407227, \"percentile_inc_nulls\": 0.5403479337692261, \"value_count\": 42, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 462.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.368699312210083, \"percentile_inc_nulls\": 0.5375582575798035, \"value_count\": 41, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 492.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.36374545097351074, \"percentile_inc_nulls\": 0.5339294672012329, \"value_count\": 40, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 640.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.3577079176902771, \"percentile_inc_nulls\": 0.5295068025588989, \"value_count\": 39, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 780.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.35182517766952515, \"percentile_inc_nulls\": 0.5251976251602173, \"value_count\": 38, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 760.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.34180134534835815, \"percentile_inc_nulls\": 0.5178549289703369, \"value_count\": 37, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1295.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.3359495997428894, \"percentile_inc_nulls\": 0.5135684013366699, \"value_count\": 36, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 756.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.33269864320755005, \"percentile_inc_nulls\": 0.511186957359314, \"value_count\": 35, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 420.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.32690876722335815, \"percentile_inc_nulls\": 0.506945788860321, \"value_count\": 34, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 748.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.3223109841346741, \"percentile_inc_nulls\": 0.5035778284072876, \"value_count\": 33, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 594.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.3128986358642578, \"percentile_inc_nulls\": 0.4966830611228943, \"value_count\": 32, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1216.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.30881941318511963, \"percentile_inc_nulls\": 0.49369490146636963, \"value_count\": 31, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 527.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.28861695528030396, \"percentile_inc_nulls\": 0.4788961410522461, \"value_count\": 30, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2610.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.2814338207244873, \"percentile_inc_nulls\": 0.47363436222076416, \"value_count\": 29, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 928.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.27428168058395386, \"percentile_inc_nulls\": 0.46839529275894165, \"value_count\": 28, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 924.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.2682209610939026, \"percentile_inc_nulls\": 0.4639556407928467, \"value_count\": 27, 
\"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 783.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.26479965448379517, \"percentile_inc_nulls\": 0.4614495038986206, \"value_count\": 26, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 442.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.2576397657394409, \"percentile_inc_nulls\": 0.45620471239089966, \"value_count\": 25, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 925.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.24946588277816772, \"percentile_inc_nulls\": 0.45021718740463257, \"value_count\": 24, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1056.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.1960570216178894, \"percentile_inc_nulls\": 0.41109395027160645, \"value_count\": 23, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 6900.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.1882237195968628, \"percentile_inc_nulls\": 0.40535593032836914, \"value_count\": 22, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1012.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.18009626865386963, \"percentile_inc_nulls\": 0.399402379989624, \"value_count\": 21, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1050.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.17328470945358276, \"percentile_inc_nulls\": 0.3944127559661865, 
\"value_count\": 20, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 880.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.16504889726638794, \"percentile_inc_nulls\": 0.38837987184524536, \"value_count\": 19, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1064.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.15641063451766968, \"percentile_inc_nulls\": 0.38205212354660034, \"value_count\": 18, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1116.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.15062075853347778, \"percentile_inc_nulls\": 0.37781089544296265, \"value_count\": 17, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 748.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.14058917760849, \"percentile_inc_nulls\": 0.3704625368118286, \"value_count\": 16, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1296.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.1346677541732788, \"percentile_inc_nulls\": 0.3661249876022339, \"value_count\": 15, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 765.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.11852127313613892, \"percentile_inc_nulls\": 0.35429733991622925, \"value_count\": 14, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2086.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.11147749423980713, \"percentile_inc_nulls\": 
0.3491376042366028, \"value_count\": 13, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 910.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.10451114177703857, \"percentile_inc_nulls\": 0.3440345525741577, \"value_count\": 12, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 900.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.09846585988998413, \"percentile_inc_nulls\": 0.33960628509521484, \"value_count\": 11, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 781.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.0922735333442688, \"percentile_inc_nulls\": 0.33507025241851807, \"value_count\": 10, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 800.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.08335655927658081, \"percentile_inc_nulls\": 0.32853835821151733, \"value_count\": 9, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1152.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.07778346538543701, \"percentile_inc_nulls\": 0.32445597648620605, \"value_count\": 8, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 720.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.015798211097717285, \"percentile_inc_nulls\": 0.27905040979385376, \"value_count\": 7, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 8008.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.011850595474243164, 
\"percentile_inc_nulls\": 0.2761586904525757, \"value_count\": 6, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 510.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.008251309394836426, \"percentile_inc_nulls\": 0.2735220789909363, \"value_count\": 5, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 465.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.0051860809326171875, \"percentile_inc_nulls\": 0.2712767720222473, \"value_count\": 4, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 396.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.002980053424835205, \"percentile_inc_nulls\": 0.2696608304977417, \"value_count\": 3, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 285.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.001215219497680664, \"percentile_inc_nulls\": 0.26836806535720825, \"value_count\": 2, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 228.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.2674778699874878, \"value_count\": 1, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 157.0, \"distinct_value_count\": 4230}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 4440, \"group_name\": \"_city_\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 4440.0, \"distinct_value_count\": 4230}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, 
{\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"city\\\"\", \"subtitle\": \"In this col, 47,174 values (26.7%) are null and there are 4230 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 4440, \"group_name\": \"_city_\", \"value\": \"houston\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 4230}, {\"value_count\": 2837, \"group_name\": \"_city_\", \"value\": \"juno beach\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 4230}, {\"value_count\": 2471, \"group_name\": \"_city_\", \"value\": \"new york\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 4230}, {\"value_count\": 2136, \"group_name\": \"_city_\", \"value\": \"san francisco\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 4230}, {\"value_count\": 1769, \"group_name\": \"_city_\", \"value\": \"boston\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 4230}, {\"value_count\": 1665, \"group_name\": \"_city_\", \"value\": \"charlotte\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 4230}, {\"value_count\": 1150, \"group_name\": \"_city_\", \"value\": \"chicago\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 4230}, {\"value_count\": 1063, 
\"group_name\": \"_city_\", \"value\": \"san diego\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 4230}, {\"value_count\": 993, \"group_name\": \"_city_\", \"value\": \"chapel hill\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 4230}, {\"value_count\": 959, \"group_name\": \"_city_\", \"value\": \"omaha\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 4230}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"canada     l6h 7h7\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 4230}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"granite bay\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 4230}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"black mountain\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 4230}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"schereville\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 4230}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"ontario cn n2z2x6\", \"total_non_null_rows\": 129192, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 4230}]}, \"encoding\": {\"tooltip\": [{\"field\": 
\"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 4440]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9821445345878601, \"percentile_inc_nulls\": 0.9893006682395935, \"value_count\": 1887, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1887.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9768267273902893, \"percentile_inc_nulls\": 0.9861140847206116, \"value_count\": 562, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 562.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9727673530578613, \"percentile_inc_nulls\": 0.9836816787719727, \"value_count\": 429, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 429.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9687647819519043, \"percentile_inc_nulls\": 0.9812832474708557, \"value_count\": 423, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 423.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9648095369338989, \"percentile_inc_nulls\": 0.9789131879806519, \"value_count\": 418, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, 
\"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 418.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9614787697792053, \"percentile_inc_nulls\": 0.9769173264503479, \"value_count\": 352, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 352.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9581764340400696, \"percentile_inc_nulls\": 0.9749384522438049, \"value_count\": 349, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 349.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9554323554039001, \"percentile_inc_nulls\": 0.973294198513031, \"value_count\": 290, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 290.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.952915370464325, \"percentile_inc_nulls\": 0.9717859625816345, \"value_count\": 266, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 266.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9504267573356628, \"percentile_inc_nulls\": 0.970294713973999, \"value_count\": 263, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 263.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9479759931564331, \"percentile_inc_nulls\": 0.968826174736023, \"value_count\": 259, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 259.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9457144737243652, \"percentile_inc_nulls\": 0.9674710631370544, \"value_count\": 
239, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 239.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9415321350097656, \"percentile_inc_nulls\": 0.9649649262428284, \"value_count\": 221, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 442.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9396585822105408, \"percentile_inc_nulls\": 0.9638422131538391, \"value_count\": 198, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 198.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9378323554992676, \"percentile_inc_nulls\": 0.9627479314804077, \"value_count\": 193, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 193.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9360250234603882, \"percentile_inc_nulls\": 0.9616649746894836, \"value_count\": 191, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 191.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.934425950050354, \"percentile_inc_nulls\": 0.9607067108154297, \"value_count\": 169, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 169.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9328362345695496, \"percentile_inc_nulls\": 0.9597541689872742, \"value_count\": 168, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 168.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 
0.9312560558319092, \"percentile_inc_nulls\": 0.9588072299957275, \"value_count\": 167, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 167.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9297041893005371, \"percentile_inc_nulls\": 0.9578773975372314, \"value_count\": 164, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 164.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9266951680183411, \"percentile_inc_nulls\": 0.9560742974281311, \"value_count\": 159, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 318.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9252285361289978, \"percentile_inc_nulls\": 0.955195426940918, \"value_count\": 155, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 155.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9238280653953552, \"percentile_inc_nulls\": 0.95435631275177, \"value_count\": 148, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 148.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9197971224784851, \"percentile_inc_nulls\": 0.9519408345222473, \"value_count\": 142, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 426.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9184629321098328, \"percentile_inc_nulls\": 0.951141357421875, \"value_count\": 141, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, 
\"sum_tokens_in_value_count_group\": 141.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9171571135520935, \"percentile_inc_nulls\": 0.9503589272499084, \"value_count\": 138, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9132680892944336, \"percentile_inc_nulls\": 0.9480285048484802, \"value_count\": 137, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 411.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9119907021522522, \"percentile_inc_nulls\": 0.9472630620002747, \"value_count\": 135, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 135.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9107416868209839, \"percentile_inc_nulls\": 0.9465146064758301, \"value_count\": 132, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 132.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9095020890235901, \"percentile_inc_nulls\": 0.9457718729972839, \"value_count\": 131, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 131.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9083003997802734, \"percentile_inc_nulls\": 0.9450517892837524, \"value_count\": 127, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 127.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9071081280708313, \"percentile_inc_nulls\": 0.9443373680114746, \"value_count\": 126, \"group_name\": 
\"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 126.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9059253334999084, \"percentile_inc_nulls\": 0.9436286091804504, \"value_count\": 125, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 125.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.904780387878418, \"percentile_inc_nulls\": 0.9429425001144409, \"value_count\": 121, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 121.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9036449193954468, \"percentile_inc_nulls\": 0.9422621130943298, \"value_count\": 120, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9014496207237244, \"percentile_inc_nulls\": 0.9409466981887817, \"value_count\": 116, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 232.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.9003614783287048, \"percentile_inc_nulls\": 0.940294623374939, \"value_count\": 115, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 115.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8982418775558472, \"percentile_inc_nulls\": 0.9390245079994202, \"value_count\": 112, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 224.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8972010612487793, 
\"percentile_inc_nulls\": 0.9384008049964905, \"value_count\": 110, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8951382637023926, \"percentile_inc_nulls\": 0.9371647834777832, \"value_count\": 109, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 218.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8941162824630737, \"percentile_inc_nulls\": 0.9365524053573608, \"value_count\": 108, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8931133151054382, \"percentile_inc_nulls\": 0.9359513521194458, \"value_count\": 106, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 106.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8921197652816772, \"percentile_inc_nulls\": 0.9353560209274292, \"value_count\": 105, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 105.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8911356925964355, \"percentile_inc_nulls\": 0.9347663521766663, \"value_count\": 104, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 104.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8901799917221069, \"percentile_inc_nulls\": 0.9341936707496643, \"value_count\": 101, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 101.0, 
\"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8883064389228821, \"percentile_inc_nulls\": 0.9330710172653198, \"value_count\": 99, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 198.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8864896297454834, \"percentile_inc_nulls\": 0.9319823384284973, \"value_count\": 96, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 192.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8855907320976257, \"percentile_inc_nulls\": 0.9314436912536621, \"value_count\": 95, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 95.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8847012519836426, \"percentile_inc_nulls\": 0.9309107065200806, \"value_count\": 94, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 94.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8838212490081787, \"percentile_inc_nulls\": 0.9303833842277527, \"value_count\": 93, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 93.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8820991516113281, \"percentile_inc_nulls\": 0.9293514490127563, \"value_count\": 91, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 182.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8812569975852966, \"percentile_inc_nulls\": 0.9288468360900879, \"value_count\": 89, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, 
\"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 89.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8795915842056274, \"percentile_inc_nulls\": 0.9278489351272583, \"value_count\": 88, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 176.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8787683844566345, \"percentile_inc_nulls\": 0.9273555874824524, \"value_count\": 87, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 87.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8771408796310425, \"percentile_inc_nulls\": 0.9263803958892822, \"value_count\": 86, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 172.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8747279644012451, \"percentile_inc_nulls\": 0.9249345064163208, \"value_count\": 85, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 255.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8739426136016846, \"percentile_inc_nulls\": 0.9244638681411743, \"value_count\": 83, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 83.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8724097013473511, \"percentile_inc_nulls\": 0.9235453605651855, \"value_count\": 81, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 162.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8708956837654114, \"percentile_inc_nulls\": 0.9226381778717041, \"value_count\": 80, 
\"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8701576590538025, \"percentile_inc_nulls\": 0.9221959114074707, \"value_count\": 78, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8694290518760681, \"percentile_inc_nulls\": 0.921759307384491, \"value_count\": 77, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 77.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8672716617584229, \"percentile_inc_nulls\": 0.9204665422439575, \"value_count\": 76, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 228.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8658522367477417, \"percentile_inc_nulls\": 0.9196160435676575, \"value_count\": 75, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 150.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8651520609855652, \"percentile_inc_nulls\": 0.9191964268684387, \"value_count\": 74, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 74.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8630797863006592, \"percentile_inc_nulls\": 0.917954683303833, \"value_count\": 73, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 219.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8617172241210938, 
\"percentile_inc_nulls\": 0.9171382188796997, \"value_count\": 72, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8590677976608276, \"percentile_inc_nulls\": 0.9155505895614624, \"value_count\": 70, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 280.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.858414888381958, \"percentile_inc_nulls\": 0.9151594042778015, \"value_count\": 69, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 69.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8564845323562622, \"percentile_inc_nulls\": 0.9140027165412903, \"value_count\": 68, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 204.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8552166223526001, \"percentile_inc_nulls\": 0.9132429361343384, \"value_count\": 67, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 134.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8545920848846436, \"percentile_inc_nulls\": 0.9128686785697937, \"value_count\": 66, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 66.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8539770245552063, \"percentile_inc_nulls\": 0.9125001430511475, \"value_count\": 65, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 65.0, 
\"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8521602749824524, \"percentile_inc_nulls\": 0.9114115238189697, \"value_count\": 64, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 192.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8515641093254089, \"percentile_inc_nulls\": 0.911054253578186, \"value_count\": 63, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 63.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.849804162979126, \"percentile_inc_nulls\": 0.909999668598175, \"value_count\": 62, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 186.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8492269515991211, \"percentile_inc_nulls\": 0.9096537828445435, \"value_count\": 61, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 61.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8452527523040771, \"percentile_inc_nulls\": 0.9072723984718323, \"value_count\": 60, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 420.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8430196046829224, \"percentile_inc_nulls\": 0.9059342741966248, \"value_count\": 59, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 236.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8419219851493835, \"percentile_inc_nulls\": 0.9052765369415283, \"value_count\": 58, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, 
\"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 116.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8408433198928833, \"percentile_inc_nulls\": 0.9046301245689392, \"value_count\": 57, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8397834897041321, \"percentile_inc_nulls\": 0.9039950966835022, \"value_count\": 56, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8356200456619263, \"percentile_inc_nulls\": 0.9015002846717834, \"value_count\": 55, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 440.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8330652117729187, \"percentile_inc_nulls\": 0.8999693989753723, \"value_count\": 54, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 270.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8320622444152832, \"percentile_inc_nulls\": 0.8993683457374573, \"value_count\": 53, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 106.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8291099667549133, \"percentile_inc_nulls\": 0.8975993394851685, \"value_count\": 52, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 312.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8262144923210144, \"percentile_inc_nulls\": 0.8958643078804016, \"value_count\": 
51, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 306.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8238489031791687, \"percentile_inc_nulls\": 0.8944467902183533, \"value_count\": 50, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 250.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8224579095840454, \"percentile_inc_nulls\": 0.8936132788658142, \"value_count\": 49, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 147.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8188244104385376, \"percentile_inc_nulls\": 0.891435980796814, \"value_count\": 48, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 384.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8161560297012329, \"percentile_inc_nulls\": 0.8898370265960693, \"value_count\": 47, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 282.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8126738667488098, \"percentile_inc_nulls\": 0.887750506401062, \"value_count\": 46, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 368.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8105448484420776, \"percentile_inc_nulls\": 0.8864747285842896, \"value_count\": 45, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 225.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 
0.8076304197311401, \"percentile_inc_nulls\": 0.8847283720970154, \"value_count\": 44, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 308.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8055960536003113, \"percentile_inc_nulls\": 0.8835092782974243, \"value_count\": 43, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 215.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.801224410533905, \"percentile_inc_nulls\": 0.8808897733688354, \"value_count\": 42, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 462.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.8004485368728638, \"percentile_inc_nulls\": 0.8804247975349426, \"value_count\": 41, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 82.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.796285092830658, \"percentile_inc_nulls\": 0.8779299855232239, \"value_count\": 40, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 440.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7944399118423462, \"percentile_inc_nulls\": 0.8768243193626404, \"value_count\": 39, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 195.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7912037968635559, \"percentile_inc_nulls\": 0.8748852014541626, \"value_count\": 38, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, 
\"sum_tokens_in_value_count_group\": 342.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7887530326843262, \"percentile_inc_nulls\": 0.8734166622161865, \"value_count\": 37, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 259.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7843246459960938, \"percentile_inc_nulls\": 0.8707630634307861, \"value_count\": 36, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 468.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7810128331184387, \"percentile_inc_nulls\": 0.8687785863876343, \"value_count\": 35, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 350.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7771521806716919, \"percentile_inc_nulls\": 0.8664652109146118, \"value_count\": 34, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 408.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7730928659439087, \"percentile_inc_nulls\": 0.8640327453613281, \"value_count\": 33, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 429.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7676425576210022, \"percentile_inc_nulls\": 0.8607668280601501, \"value_count\": 32, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 576.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7632425427436829, \"percentile_inc_nulls\": 0.8581302165985107, \"value_count\": 31, \"group_name\": 
\"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 465.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7578490376472473, \"percentile_inc_nulls\": 0.8548983335494995, \"value_count\": 30, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 570.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7540073394775391, \"percentile_inc_nulls\": 0.8525962829589844, \"value_count\": 29, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 406.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7471187114715576, \"percentile_inc_nulls\": 0.848468542098999, \"value_count\": 28, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 728.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7412425875663757, \"percentile_inc_nulls\": 0.8449474573135376, \"value_count\": 27, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 621.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7358301281929016, \"percentile_inc_nulls\": 0.8417041897773743, \"value_count\": 26, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 572.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7301527261734009, \"percentile_inc_nulls\": 0.8383021354675293, \"value_count\": 25, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 600.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7231127023696899, 
\"percentile_inc_nulls\": 0.8340836763381958, \"value_count\": 24, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 744.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.7181071639060974, \"percentile_inc_nulls\": 0.8310842514038086, \"value_count\": 23, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 529.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.6414999961853027, \"percentile_inc_nulls\": 0.7851796746253967, \"value_count\": 22, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 8096.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.6252058148384094, \"percentile_inc_nulls\": 0.7754158973693848, \"value_count\": 21, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1722.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.6036316156387329, \"percentile_inc_nulls\": 0.7624882459640503, \"value_count\": 20, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2280.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.5906871557235718, \"percentile_inc_nulls\": 0.7547316551208496, \"value_count\": 19, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1368.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.5736548900604248, \"percentile_inc_nulls\": 0.7445255517959595, \"value_count\": 18, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1800.0, 
\"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.5601426959037781, \"percentile_inc_nulls\": 0.7364287972450256, \"value_count\": 17, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1428.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.545457124710083, \"percentile_inc_nulls\": 0.7276289463043213, \"value_count\": 16, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1552.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.5306958556175232, \"percentile_inc_nulls\": 0.7187836766242981, \"value_count\": 15, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1560.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.5112223625183105, \"percentile_inc_nulls\": 0.7071147561073303, \"value_count\": 14, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2058.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.46656954288482666, \"percentile_inc_nulls\": 0.6803579330444336, \"value_count\": 13, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 4719.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.4002573490142822, \"percentile_inc_nulls\": 0.6406223773956299, \"value_count\": 12, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 7008.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.3459245562553406, \"percentile_inc_nulls\": 0.6080650091171265, \"value_count\": 11, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, 
\"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 5742.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.3001267910003662, \"percentile_inc_nulls\": 0.5806220769882202, \"value_count\": 10, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 4840.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.2458791732788086, \"percentile_inc_nulls\": 0.5481158494949341, \"value_count\": 9, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 5733.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.2023523449897766, \"percentile_inc_nulls\": 0.52203369140625, \"value_count\": 8, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 4600.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.13657957315444946, \"percentile_inc_nulls\": 0.48262137174606323, \"value_count\": 7, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 6951.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.10535377264022827, \"percentile_inc_nulls\": 0.4639102816581726, \"value_count\": 6, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 3300.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.07720333337783813, \"percentile_inc_nulls\": 0.44704192876815796, \"value_count\": 5, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2975.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.0521470308303833, \"percentile_inc_nulls\": 0.4320276975631714, 
\"value_count\": 4, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2648.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.03261673450469971, \"percentile_inc_nulls\": 0.42032480239868164, \"value_count\": 3, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2064.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.014959990978240967, \"percentile_inc_nulls\": 0.4097445011138916, \"value_count\": 2, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1866.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.40078020095825195, \"value_count\": 1, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1581.0, \"distinct_value_count\": 10844}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 1887, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1887.0, \"distinct_value_count\": 10844}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column 
\\\"street_address\\\"\", \"subtitle\": \"In this col, 70,684 values (40.1%) are null and there are 10844 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1887, \"group_name\": \"_street_address_\", \"value\": \"700 universe blvd\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 10844}, {\"value_count\": 562, \"group_name\": \"_street_address_\", \"value\": \"700 universe blvd.\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 10844}, {\"value_count\": 429, \"group_name\": \"_street_address_\", \"value\": \"130 roberts street\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 10844}, {\"value_count\": 423, \"group_name\": \"_street_address_\", \"value\": \"800 taylor st, suite 200\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 10844}, {\"value_count\": 418, \"group_name\": \"_street_address_\", \"value\": \"300 exelon way\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 10844}, {\"value_count\": 352, \"group_name\": \"_street_address_\", \"value\": \"14302 fnb parkway\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 10844}, {\"value_count\": 349, \"group_name\": \"_street_address_\", \"value\": \"1519 king street\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 10844}, {\"value_count\": 290, \"group_name\": \"_street_address_\", \"value\": \"804 carnegie center\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 10844}, {\"value_count\": 266, \"group_name\": \"_street_address_\", \"value\": \"5400 westheimer court\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 10844}, {\"value_count\": 263, \"group_name\": 
\"_street_address_\", \"value\": \"333 washington street\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 10844}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"401 n michigan ave\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 10844}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"c/o soltage llc, 66 york street, 5th floor\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 10844}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"c/o enel north america  inc\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 10844}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"1300 n 17th st.\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 10844}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"12200 ashcake rd\", \"total_non_null_rows\": 105682, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 10844}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": 
\"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 1887]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9781083464622498, \"percentile_inc_nulls\": 0.984095573425293, \"value_count\": 2805, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2805.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.9638494849205017, \"percentile_inc_nulls\": 0.9737364053726196, \"value_count\": 1827, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1827.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.9563883543014526, \"percentile_inc_nulls\": 0.9683158993721008, \"value_count\": 956, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 956.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.9495516419410706, \"percentile_inc_nulls\": 0.963348925113678, \"value_count\": 876, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 876.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.9427851438522339, \"percentile_inc_nulls\": 0.9584330320358276, \"value_count\": 867, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 867.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.9373453855514526, \"percentile_inc_nulls\": 0.9544810056686401, \"value_count\": 697, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, 
\"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 697.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.9322100281715393, \"percentile_inc_nulls\": 0.9507501721382141, \"value_count\": 658, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 658.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.9273243546485901, \"percentile_inc_nulls\": 0.9472007155418396, \"value_count\": 626, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 626.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.922735333442688, \"percentile_inc_nulls\": 0.9438667297363281, \"value_count\": 588, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 588.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.9182945489883423, \"percentile_inc_nulls\": 0.9406405091285706, \"value_count\": 569, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 569.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.9141347408294678, \"percentile_inc_nulls\": 0.9376183748245239, \"value_count\": 533, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 533.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.9102013111114502, \"percentile_inc_nulls\": 0.9347606897354126, \"value_count\": 504, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 504.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.906283438205719, \"percentile_inc_nulls\": 0.9319143295288086, \"value_count\": 502, \"group_name\": \"_zip_code_\", 
\"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 502.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.9026777148246765, \"percentile_inc_nulls\": 0.929294764995575, \"value_count\": 462, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 462.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.89908766746521, \"percentile_inc_nulls\": 0.9266865253448486, \"value_count\": 460, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 460.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8955131769180298, \"percentile_inc_nulls\": 0.9240896701812744, \"value_count\": 458, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 458.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8920089602470398, \"percentile_inc_nulls\": 0.9215438365936279, \"value_count\": 449, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 449.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8885515928268433, \"percentile_inc_nulls\": 0.9190320372581482, \"value_count\": 443, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 443.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8852112293243408, \"percentile_inc_nulls\": 0.9166052341461182, \"value_count\": 428, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 428.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8819957971572876, \"percentile_inc_nulls\": 0.914269208908081, \"value_count\": 412, 
\"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 412.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8788739442825317, \"percentile_inc_nulls\": 0.9120011925697327, \"value_count\": 400, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 400.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8757911920547485, \"percentile_inc_nulls\": 0.9097615480422974, \"value_count\": 395, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 395.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8730127811431885, \"percentile_inc_nulls\": 0.907742977142334, \"value_count\": 356, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 356.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.870296835899353, \"percentile_inc_nulls\": 0.9057698249816895, \"value_count\": 348, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 348.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8681193590164185, \"percentile_inc_nulls\": 0.9041878581047058, \"value_count\": 279, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 279.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8659496903419495, \"percentile_inc_nulls\": 0.9026116132736206, \"value_count\": 278, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 278.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8638424873352051, \"percentile_inc_nulls\": 
0.9010807275772095, \"value_count\": 270, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 270.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8597372770309448, \"percentile_inc_nulls\": 0.8980982899665833, \"value_count\": 263, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 526.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8577159643173218, \"percentile_inc_nulls\": 0.8966297507286072, \"value_count\": 259, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 259.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8557179570198059, \"percentile_inc_nulls\": 0.8951781988143921, \"value_count\": 256, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 256.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8537512421607971, \"percentile_inc_nulls\": 0.8937493562698364, \"value_count\": 252, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 252.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8517923355102539, \"percentile_inc_nulls\": 0.8923261761665344, \"value_count\": 251, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 251.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8498489856719971, \"percentile_inc_nulls\": 0.8909143209457397, \"value_count\": 249, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 249.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 
0.8479368686676025, \"percentile_inc_nulls\": 0.8895251750946045, \"value_count\": 245, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 245.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8460325598716736, \"percentile_inc_nulls\": 0.8881416916847229, \"value_count\": 244, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 244.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8441438674926758, \"percentile_inc_nulls\": 0.8867695331573486, \"value_count\": 242, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 242.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8422864079475403, \"percentile_inc_nulls\": 0.8854200839996338, \"value_count\": 238, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 238.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8404601812362671, \"percentile_inc_nulls\": 0.8840932846069336, \"value_count\": 234, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8368232250213623, \"percentile_inc_nulls\": 0.8814510703086853, \"value_count\": 233, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 466.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8332019448280334, \"percentile_inc_nulls\": 0.8788201808929443, \"value_count\": 232, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 464.0, \"distinct_value_count\": 
6384}, {\"percentile_ex_nulls\": 0.8314459323883057, \"percentile_inc_nulls\": 0.8775444030761719, \"value_count\": 225, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 225.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8279807567596436, \"percentile_inc_nulls\": 0.8750269412994385, \"value_count\": 222, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 444.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8262715339660645, \"percentile_inc_nulls\": 0.8737851977348328, \"value_count\": 219, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 219.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8246170282363892, \"percentile_inc_nulls\": 0.8725831508636475, \"value_count\": 212, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 212.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8229702711105347, \"percentile_inc_nulls\": 0.8713867664337158, \"value_count\": 211, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 211.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8213468790054321, \"percentile_inc_nulls\": 0.8702074289321899, \"value_count\": 208, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 208.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8198094367980957, \"percentile_inc_nulls\": 0.8690904378890991, \"value_count\": 197, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 
197.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8183343410491943, \"percentile_inc_nulls\": 0.8680187463760376, \"value_count\": 189, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 189.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8168749213218689, \"percentile_inc_nulls\": 0.866958498954773, \"value_count\": 187, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 187.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8154311180114746, \"percentile_inc_nulls\": 0.8659095168113708, \"value_count\": 185, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 185.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8139950633049011, \"percentile_inc_nulls\": 0.8648662567138672, \"value_count\": 184, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 184.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8125668168067932, \"percentile_inc_nulls\": 0.8638286590576172, \"value_count\": 183, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 183.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8097416162490845, \"percentile_inc_nulls\": 0.8617761135101318, \"value_count\": 181, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 362.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8083524107933044, \"percentile_inc_nulls\": 0.8607668280601501, \"value_count\": 178, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, 
\"sum_tokens_in_value_count_group\": 178.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8069709539413452, \"percentile_inc_nulls\": 0.8597632050514221, \"value_count\": 177, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 177.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8056130409240723, \"percentile_inc_nulls\": 0.8587766289710999, \"value_count\": 174, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 174.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8042628169059753, \"percentile_inc_nulls\": 0.8577957153320312, \"value_count\": 173, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 173.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8029204607009888, \"percentile_inc_nulls\": 0.8568204641342163, \"value_count\": 172, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 172.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.8002669215202332, \"percentile_inc_nulls\": 0.8548926711082458, \"value_count\": 170, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 340.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7976446151733398, \"percentile_inc_nulls\": 0.85298752784729, \"value_count\": 168, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 336.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7950378656387329, \"percentile_inc_nulls\": 0.8510937690734863, \"value_count\": 167, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, 
\"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 334.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7937423586845398, \"percentile_inc_nulls\": 0.8501524925231934, \"value_count\": 166, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 166.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7924546003341675, \"percentile_inc_nulls\": 0.8492169380187988, \"value_count\": 165, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 165.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7911824584007263, \"percentile_inc_nulls\": 0.8482927680015564, \"value_count\": 163, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 163.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7886537909507751, \"percentile_inc_nulls\": 0.8464556932449341, \"value_count\": 162, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 324.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7861407399177551, \"percentile_inc_nulls\": 0.8446298837661743, \"value_count\": 161, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 322.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7848920226097107, \"percentile_inc_nulls\": 0.8437227010726929, \"value_count\": 160, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7836511135101318, \"percentile_inc_nulls\": 0.8428211808204651, \"value_count\": 159, \"group_name\": \"_zip_code_\", 
\"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 159.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7824180126190186, \"percentile_inc_nulls\": 0.841925323009491, \"value_count\": 158, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 158.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7812082767486572, \"percentile_inc_nulls\": 0.8410464525222778, \"value_count\": 155, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 155.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7800064086914062, \"percentile_inc_nulls\": 0.8401732444763184, \"value_count\": 154, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 154.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7788200974464417, \"percentile_inc_nulls\": 0.839311420917511, \"value_count\": 152, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 152.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7776416540145874, \"percentile_inc_nulls\": 0.8384552597999573, \"value_count\": 151, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 151.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.776470959186554, \"percentile_inc_nulls\": 0.8376047611236572, \"value_count\": 150, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 150.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7753158807754517, \"percentile_inc_nulls\": 0.8367655873298645, \"value_count\": 148, 
\"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 148.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7741842269897461, \"percentile_inc_nulls\": 0.8359434604644775, \"value_count\": 145, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 145.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7730681896209717, \"percentile_inc_nulls\": 0.8351325988769531, \"value_count\": 143, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 143.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7719599604606628, \"percentile_inc_nulls\": 0.8343274593353271, \"value_count\": 142, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 142.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7708594799041748, \"percentile_inc_nulls\": 0.8335279822349548, \"value_count\": 141, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 141.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7687054872512817, \"percentile_inc_nulls\": 0.831963062286377, \"value_count\": 138, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 276.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.767636239528656, \"percentile_inc_nulls\": 0.8311862945556641, \"value_count\": 137, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 137.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7655134201049805, \"percentile_inc_nulls\": 
0.8296440243721008, \"value_count\": 136, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 272.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7644597887992859, \"percentile_inc_nulls\": 0.8288785815238953, \"value_count\": 135, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 135.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7634140253067017, \"percentile_inc_nulls\": 0.8281188011169434, \"value_count\": 134, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 134.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7623916268348694, \"percentile_inc_nulls\": 0.8273760080337524, \"value_count\": 131, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 131.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7603780627250671, \"percentile_inc_nulls\": 0.8259131908416748, \"value_count\": 129, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 258.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7593790888786316, \"percentile_inc_nulls\": 0.8251873850822449, \"value_count\": 128, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 128.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7573966979980469, \"percentile_inc_nulls\": 0.8237472176551819, \"value_count\": 127, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 254.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 
0.7564133405685425, \"percentile_inc_nulls\": 0.823032796382904, \"value_count\": 126, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 126.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7554377913475037, \"percentile_inc_nulls\": 0.8223240375518799, \"value_count\": 125, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 125.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7515667676925659, \"percentile_inc_nulls\": 0.8195117115974426, \"value_count\": 124, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 496.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.750614583492279, \"percentile_inc_nulls\": 0.8188199400901794, \"value_count\": 122, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 122.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7487415075302124, \"percentile_inc_nulls\": 0.8174591660499573, \"value_count\": 120, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 240.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7478127479553223, \"percentile_inc_nulls\": 0.8167843818664551, \"value_count\": 119, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 119.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7450499534606934, \"percentile_inc_nulls\": 0.8147772550582886, \"value_count\": 118, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 354.0, \"distinct_value_count\": 
6384}, {\"percentile_ex_nulls\": 0.7441368699073792, \"percentile_inc_nulls\": 0.8141138553619385, \"value_count\": 117, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 117.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7432393431663513, \"percentile_inc_nulls\": 0.8134617805480957, \"value_count\": 115, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 115.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7423496246337891, \"percentile_inc_nulls\": 0.8128154277801514, \"value_count\": 114, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7414677143096924, \"percentile_inc_nulls\": 0.8121746778488159, \"value_count\": 113, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 113.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7397195100784302, \"percentile_inc_nulls\": 0.8109046220779419, \"value_count\": 112, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 224.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7388532161712646, \"percentile_inc_nulls\": 0.8102751970291138, \"value_count\": 111, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 111.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7379946708679199, \"percentile_inc_nulls\": 0.8096514940261841, \"value_count\": 110, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 
110.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7362933158874512, \"percentile_inc_nulls\": 0.8084154725074768, \"value_count\": 109, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 218.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7354504466056824, \"percentile_inc_nulls\": 0.8078030943870544, \"value_count\": 108, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7346231937408447, \"percentile_inc_nulls\": 0.8072021007537842, \"value_count\": 106, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 106.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.732984185218811, \"percentile_inc_nulls\": 0.8060113787651062, \"value_count\": 105, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 210.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.731360912322998, \"percentile_inc_nulls\": 0.8048319816589355, \"value_count\": 104, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 208.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7281454205513, \"percentile_inc_nulls\": 0.8024959564208984, \"value_count\": 103, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 412.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7273493409156799, \"percentile_inc_nulls\": 0.8019176125526428, \"value_count\": 102, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, 
\"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7257884740829468, \"percentile_inc_nulls\": 0.800783634185791, \"value_count\": 100, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.725023627281189, \"percentile_inc_nulls\": 0.80022794008255, \"value_count\": 98, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7235095500946045, \"percentile_inc_nulls\": 0.7991279363632202, \"value_count\": 97, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 194.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7227603197097778, \"percentile_inc_nulls\": 0.7985836267471313, \"value_count\": 96, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 96.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7212774753570557, \"percentile_inc_nulls\": 0.7975063323974609, \"value_count\": 95, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 190.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7190765738487244, \"percentile_inc_nulls\": 0.7959073781967163, \"value_count\": 94, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 282.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7168990969657898, \"percentile_inc_nulls\": 0.7943254709243774, \"value_count\": 93, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, 
\"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 279.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7140582799911499, \"percentile_inc_nulls\": 0.79226154088974, \"value_count\": 91, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 364.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7133558988571167, \"percentile_inc_nulls\": 0.7917512655258179, \"value_count\": 90, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7112720608711243, \"percentile_inc_nulls\": 0.7902373671531677, \"value_count\": 89, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 267.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7085248827934265, \"percentile_inc_nulls\": 0.7882415056228638, \"value_count\": 88, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 352.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7064878940582275, \"percentile_inc_nulls\": 0.7867616415023804, \"value_count\": 87, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 261.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7038031816482544, \"percentile_inc_nulls\": 0.7848111391067505, \"value_count\": 86, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 344.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7018129825592041, \"percentile_inc_nulls\": 0.7833652496337891, \"value_count\": 85, \"group_name\": \"_zip_code_\", 
\"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 255.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.7005017995834351, \"percentile_inc_nulls\": 0.7824127078056335, \"value_count\": 84, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 168.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6998540163040161, \"percentile_inc_nulls\": 0.7819421291351318, \"value_count\": 83, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 83.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6979341506958008, \"percentile_inc_nulls\": 0.7805472612380981, \"value_count\": 82, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 246.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6960376501083374, \"percentile_inc_nulls\": 0.7791694402694702, \"value_count\": 81, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 243.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6954132914543152, \"percentile_inc_nulls\": 0.7787158489227295, \"value_count\": 80, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 80.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6947967410087585, \"percentile_inc_nulls\": 0.7782679200172424, \"value_count\": 79, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 79.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6917529702186584, \"percentile_inc_nulls\": 0.7760565876960754, \"value_count\": 78, 
\"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 390.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6899501085281372, \"percentile_inc_nulls\": 0.774746835231781, \"value_count\": 77, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 231.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6875775456428528, \"percentile_inc_nulls\": 0.7730231285095215, \"value_count\": 76, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 304.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6869922280311584, \"percentile_inc_nulls\": 0.7725979089736938, \"value_count\": 75, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 75.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6841045618057251, \"percentile_inc_nulls\": 0.7705000042915344, \"value_count\": 74, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 370.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6812559366226196, \"percentile_inc_nulls\": 0.7684304118156433, \"value_count\": 73, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 365.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6806939840316772, \"percentile_inc_nulls\": 0.7680221796035767, \"value_count\": 72, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 72.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6779233813285828, \"percentile_inc_nulls\": 0.7660093307495117, 
\"value_count\": 71, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 355.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6762844324111938, \"percentile_inc_nulls\": 0.7648186087608337, \"value_count\": 70, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 210.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6719763278961182, \"percentile_inc_nulls\": 0.761688768863678, \"value_count\": 69, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 552.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6687921285629272, \"percentile_inc_nulls\": 0.7593753933906555, \"value_count\": 68, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 408.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6656546592712402, \"percentile_inc_nulls\": 0.7570960521697998, \"value_count\": 67, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 402.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6635942459106445, \"percentile_inc_nulls\": 0.7555991411209106, \"value_count\": 66, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 264.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6585213541984558, \"percentile_inc_nulls\": 0.7519136667251587, \"value_count\": 65, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 650.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6555244326591492, 
\"percentile_inc_nulls\": 0.7497363090515137, \"value_count\": 64, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 384.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6525743007659912, \"percentile_inc_nulls\": 0.7475930452346802, \"value_count\": 63, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 378.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6506388187408447, \"percentile_inc_nulls\": 0.7461869120597839, \"value_count\": 62, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 248.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6477823257446289, \"percentile_inc_nulls\": 0.7441116571426392, \"value_count\": 61, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 366.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6449726819992065, \"percentile_inc_nulls\": 0.7420704364776611, \"value_count\": 60, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 360.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6408284902572632, \"percentile_inc_nulls\": 0.7390596866607666, \"value_count\": 59, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 531.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6376599073410034, \"percentile_inc_nulls\": 0.7367576360702515, \"value_count\": 58, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 406.0, \"distinct_value_count\": 6384}, 
{\"percentile_ex_nulls\": 0.6345459222793579, \"percentile_inc_nulls\": 0.7344952821731567, \"value_count\": 57, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 399.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6319235563278198, \"percentile_inc_nulls\": 0.7325901985168457, \"value_count\": 56, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 336.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.62848961353302, \"percentile_inc_nulls\": 0.730095386505127, \"value_count\": 55, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 440.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6268038153648376, \"percentile_inc_nulls\": 0.7288706302642822, \"value_count\": 54, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 216.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6230810880661011, \"percentile_inc_nulls\": 0.7261660099029541, \"value_count\": 53, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 477.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6214576959609985, \"percentile_inc_nulls\": 0.7249866724014282, \"value_count\": 52, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 208.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6182734966278076, \"percentile_inc_nulls\": 0.7226732969284058, \"value_count\": 51, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 408.0, 
\"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.614761471748352, \"percentile_inc_nulls\": 0.7201218008995056, \"value_count\": 50, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 450.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6124669313430786, \"percentile_inc_nulls\": 0.7184548377990723, \"value_count\": 49, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 294.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6079715490341187, \"percentile_inc_nulls\": 0.7151888608932495, \"value_count\": 48, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 576.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.6028361320495605, \"percentile_inc_nulls\": 0.7114579677581787, \"value_count\": 47, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 658.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.5931429862976074, \"percentile_inc_nulls\": 0.7044157981872559, \"value_count\": 46, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1242.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.5896309614181519, \"percentile_inc_nulls\": 0.7018643021583557, \"value_count\": 45, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 450.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.5868837237358093, \"percentile_inc_nulls\": 0.6998684406280518, \"value_count\": 44, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, 
\"sum_tokens_in_value_count_group\": 352.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.5818498134613037, \"percentile_inc_nulls\": 0.6962112784385681, \"value_count\": 43, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 645.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.578244149684906, \"percentile_inc_nulls\": 0.6935917139053345, \"value_count\": 42, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 462.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.5718444585800171, \"percentile_inc_nulls\": 0.6889423131942749, \"value_count\": 41, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 820.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.5674738883972168, \"percentile_inc_nulls\": 0.6857670545578003, \"value_count\": 40, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 560.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.5622994899749756, \"percentile_inc_nulls\": 0.682007908821106, \"value_count\": 39, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 663.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.5551818013191223, \"percentile_inc_nulls\": 0.6768368482589722, \"value_count\": 38, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 912.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.5485401749610901, \"percentile_inc_nulls\": 0.6720116138458252, \"value_count\": 37, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, 
\"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 851.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.5417970418930054, \"percentile_inc_nulls\": 0.6671127080917358, \"value_count\": 36, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 864.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.5363339185714722, \"percentile_inc_nulls\": 0.6631436944007874, \"value_count\": 35, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 700.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.52757728099823, \"percentile_inc_nulls\": 0.6567819118499756, \"value_count\": 34, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1122.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.5216536521911621, \"percentile_inc_nulls\": 0.6524783372879028, \"value_count\": 33, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 759.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.5106648802757263, \"percentile_inc_nulls\": 0.6444950103759766, \"value_count\": 32, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1408.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.5041325092315674, \"percentile_inc_nulls\": 0.6397491693496704, \"value_count\": 31, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 837.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.4760362505912781, \"percentile_inc_nulls\": 0.6193370819091797, \"value_count\": 30, \"group_name\": \"_zip_code_\", 
\"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 3600.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.4685673117637634, \"percentile_inc_nulls\": 0.6139108538627625, \"value_count\": 29, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 957.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.4598262906074524, \"percentile_inc_nulls\": 0.607560396194458, \"value_count\": 28, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1120.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.4490794539451599, \"percentile_inc_nulls\": 0.5997527837753296, \"value_count\": 27, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1377.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.44238317012786865, \"percentile_inc_nulls\": 0.594887912273407, \"value_count\": 26, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 858.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.43360310792922974, \"percentile_inc_nulls\": 0.5885090827941895, \"value_count\": 25, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1125.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.4227392077445984, \"percentile_inc_nulls\": 0.5806164741516113, \"value_count\": 24, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1392.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.32885873317718506, \"percentile_inc_nulls\": 0.5124117136001587, \"value_count\": 23, 
\"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 12029.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.31237560510635376, \"percentile_inc_nulls\": 0.5004366040229797, \"value_count\": 22, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2112.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.29729729890823364, \"percentile_inc_nulls\": 0.4894821047782898, \"value_count\": 21, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1932.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.28777581453323364, \"percentile_inc_nulls\": 0.48256468772888184, \"value_count\": 20, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1220.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.2762095332145691, \"percentile_inc_nulls\": 0.47416168451309204, \"value_count\": 19, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1482.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.26216137409210205, \"percentile_inc_nulls\": 0.4639556407928467, \"value_count\": 18, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1800.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.2504858374595642, \"percentile_inc_nulls\": 0.45547330379486084, \"value_count\": 17, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1496.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.23737424612045288, \"percentile_inc_nulls\": 
0.44594764709472656, \"value_count\": 16, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1680.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.22718936204910278, \"percentile_inc_nulls\": 0.4385482668876648, \"value_count\": 15, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1305.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.20041990280151367, \"percentile_inc_nulls\": 0.4191000461578369, \"value_count\": 14, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 3430.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.18875211477279663, \"percentile_inc_nulls\": 0.41062337160110474, \"value_count\": 13, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1495.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.1753596067428589, \"percentile_inc_nulls\": 0.40089356899261475, \"value_count\": 12, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1716.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.16196703910827637, \"percentile_inc_nulls\": 0.39116382598876953, \"value_count\": 11, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1716.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.15135294198989868, \"percentile_inc_nulls\": 0.3834525942802429, \"value_count\": 10, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1360.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 
0.13386297225952148, \"percentile_inc_nulls\": 0.37074607610702515, \"value_count\": 9, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2241.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.12281179428100586, \"percentile_inc_nulls\": 0.36271733045578003, \"value_count\": 8, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 1416.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.02824455499649048, \"percentile_inc_nulls\": 0.2940135598182678, \"value_count\": 7, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 12117.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.021314144134521484, \"percentile_inc_nulls\": 0.28897857666015625, \"value_count\": 6, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 888.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.014524221420288086, \"percentile_inc_nulls\": 0.2840456962585449, \"value_count\": 5, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 870.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.008499085903167725, \"percentile_inc_nulls\": 0.279668390750885, \"value_count\": 4, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 772.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.004823207855224609, \"percentile_inc_nulls\": 0.27699780464172363, \"value_count\": 3, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 471.0, 
\"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.0018886923789978027, \"percentile_inc_nulls\": 0.27486592531204224, \"value_count\": 2, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 376.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.27349376678466797, \"value_count\": 1, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 242.0, \"distinct_value_count\": 6384}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 2805, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"sum_tokens_in_value_count_group\": 2805.0, \"distinct_value_count\": 6384}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"zip_code\\\"\", \"subtitle\": \"In this col, 48,235 values (27.3%) are null and there are 6384 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 2805, \"group_name\": \"_zip_code_\", \"value\": \"33408\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 6384}, {\"value_count\": 1827, \"group_name\": \"_zip_code_\", \"value\": \"77002\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, 
\"distinct_value_count\": 6384}, {\"value_count\": 956, \"group_name\": \"_zip_code_\", \"value\": \"27517\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 6384}, {\"value_count\": 876, \"group_name\": \"_zip_code_\", \"value\": \"01810\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 6384}, {\"value_count\": 867, \"group_name\": \"_zip_code_\", \"value\": \"77056\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 6384}, {\"value_count\": 697, \"group_name\": \"_zip_code_\", \"value\": \"68154\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 6384}, {\"value_count\": 658, \"group_name\": \"_zip_code_\", \"value\": \"28801\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 6384}, {\"value_count\": 626, \"group_name\": \"_zip_code_\", \"value\": \"94104\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 6384}, {\"value_count\": 588, \"group_name\": \"_zip_code_\", \"value\": \"28202\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 6384}, {\"value_count\": 569, \"group_name\": \"_zip_code_\", \"value\": \"10017\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 6384}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": 
[{\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"00222\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 6384}, {\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"91377\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 6384}, {\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"44130\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 6384}, {\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"02321\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 6384}, {\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"56142\", \"total_non_null_rows\": 128131, \"total_rows_inc_nulls\": 176366, \"distinct_value_count\": 6384}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 2805]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}], \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\"}, {\"mode\": \"vega-lite\"});\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"vconcat\": [{\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9993402361869812, \"percentile_inc_nulls\": 0.9993402361869812, \"value_count\": 26, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 26.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 0.9982744455337524, \"percentile_inc_nulls\": 0.9982744455337524, \"value_count\": 21, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 0.99776691198349, \"percentile_inc_nulls\": 0.99776691198349, \"value_count\": 20, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 20.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 0.9973101019859314, \"percentile_inc_nulls\": 0.9973101019859314, \"value_count\": 18, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 18.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 0.9968787431716919, \"percentile_inc_nulls\": 0.9968787431716919, \"value_count\": 17, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 17.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 0.9952292442321777, \"percentile_inc_nulls\": 0.9952292442321777, \"value_count\": 13, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 65.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 
0.9943157434463501, \"percentile_inc_nulls\": 0.9943157434463501, \"value_count\": 12, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 36.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 0.9923617839813232, \"percentile_inc_nulls\": 0.9923617839813232, \"value_count\": 11, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 77.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 0.9900779128074646, \"percentile_inc_nulls\": 0.9900779128074646, \"value_count\": 10, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 0.9873372912406921, \"percentile_inc_nulls\": 0.9873372912406921, \"value_count\": 9, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 0.9769837856292725, \"percentile_inc_nulls\": 0.9769837856292725, \"value_count\": 8, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 408.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 0.9526479840278625, \"percentile_inc_nulls\": 0.9526479840278625, \"value_count\": 7, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 959.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 0.9031643867492676, \"percentile_inc_nulls\": 0.9031643867492676, \"value_count\": 6, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 1950.0, 
\"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 0.825640082359314, \"percentile_inc_nulls\": 0.825640082359314, \"value_count\": 5, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 3055.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 0.6853604316711426, \"percentile_inc_nulls\": 0.6853604316711426, \"value_count\": 4, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 5528.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 0.48103129863739014, \"percentile_inc_nulls\": 0.48103129863739014, \"value_count\": 3, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 8052.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 0.20037049055099487, \"percentile_inc_nulls\": 0.20037049055099487, \"value_count\": 2, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 11060.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 1, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 7896.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 26, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 26.0, \"distinct_value_count\": 18658}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": 
\"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"company_name\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 18658 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 26, \"group_name\": \"_company_name_\", \"value\": \"georgia pacific corporation\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 18658}, {\"value_count\": 21, \"group_name\": \"_company_name_\", \"value\": \"international paper company\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 18658}, {\"value_count\": 21, \"group_name\": \"_company_name_\", \"value\": \"calpine operating services company incorporated\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 18658}, {\"value_count\": 20, \"group_name\": \"_company_name_\", \"value\": \"calpine corporation\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 18658}, {\"value_count\": 18, \"group_name\": \"_company_name_\", \"value\": \"calpine eastern corporation\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 18658}, {\"value_count\": 17, \"group_name\": \"_company_name_\", \"value\": \"weyerhaeuser company\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 18658}, {\"value_count\": 13, \"group_name\": \"_company_name_\", \"value\": \"dow chemical company\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 18658}, {\"value_count\": 13, \"group_name\": 
\"_company_name_\", \"value\": \"newpage corporation\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 18658}, {\"value_count\": 13, \"group_name\": \"_company_name_\", \"value\": \"smurfit stone container corporation\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 18658}, {\"value_count\": 13, \"group_name\": \"_company_name_\", \"value\": \"springfield city of\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 18658}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"valley ng power company limited liability company\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 18658}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"crossover wind limited liability company\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 18658}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"lowell limited liability company\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 18658}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"mm tomoka farms energy limited liability company\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 18658}, {\"value_count\": 1, \"group_name\": 
\"_company_name_\", \"value\": \"caron garden limited liability company\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 18658}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 26]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.8926863074302673, \"percentile_inc_nulls\": 0.9180095195770264, \"value_count\": 3231, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 3231.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.8061645030975342, \"percentile_inc_nulls\": 0.8519045114517212, \"value_count\": 2605, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 2605.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.7459479570388794, \"percentile_inc_nulls\": 0.8058974146842957, \"value_count\": 1813, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 1813.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.692307710647583, \"percentile_inc_nulls\": 0.764914870262146, \"value_count\": 1615, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 1615.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 
0.6463066339492798, \"percentile_inc_nulls\": 0.7297688126564026, \"value_count\": 1385, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 1385.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.6083765029907227, \"percentile_inc_nulls\": 0.700789213180542, \"value_count\": 1142, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 1142.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.5708781480789185, \"percentile_inc_nulls\": 0.6721394658088684, \"value_count\": 1129, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 1129.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.5375980138778687, \"percentile_inc_nulls\": 0.6467125415802002, \"value_count\": 1002, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 1002.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.5072405934333801, \"percentile_inc_nulls\": 0.623518705368042, \"value_count\": 914, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 914.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.4776139259338379, \"percentile_inc_nulls\": 0.6008831262588501, \"value_count\": 892, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 892.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.4506111145019531, \"percentile_inc_nulls\": 0.5802522301673889, \"value_count\": 813, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 813.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 
0.4274279475212097, \"percentile_inc_nulls\": 0.5625396370887756, \"value_count\": 698, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 698.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.4053075313568115, \"percentile_inc_nulls\": 0.5456390976905823, \"value_count\": 666, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 666.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.38355255126953125, \"percentile_inc_nulls\": 0.5290176868438721, \"value_count\": 655, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 655.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.36219608783721924, \"percentile_inc_nulls\": 0.5127007961273193, \"value_count\": 643, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 643.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.34253352880477905, \"percentile_inc_nulls\": 0.49767810106277466, \"value_count\": 592, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 592.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.3235684633255005, \"percentile_inc_nulls\": 0.483188271522522, \"value_count\": 571, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 571.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.30476951599121094, \"percentile_inc_nulls\": 0.4688253402709961, \"value_count\": 566, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 566.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 
0.2862362265586853, \"percentile_inc_nulls\": 0.4546654224395752, \"value_count\": 558, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 558.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.26853328943252563, \"percentile_inc_nulls\": 0.4411398768424988, \"value_count\": 533, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 533.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.251461386680603, \"percentile_inc_nulls\": 0.4280965328216553, \"value_count\": 514, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 514.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.2351534366607666, \"percentile_inc_nulls\": 0.4156368374824524, \"value_count\": 491, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 491.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.21914440393447876, \"percentile_inc_nulls\": 0.40340548753738403, \"value_count\": 482, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 482.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.2044306993484497, \"percentile_inc_nulls\": 0.39216381311416626, \"value_count\": 443, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 443.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.19051414728164673, \"percentile_inc_nulls\": 0.3815311789512634, \"value_count\": 419, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 419.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 
0.17686331272125244, \"percentile_inc_nulls\": 0.3711015582084656, \"value_count\": 411, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 411.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.16444134712219238, \"percentile_inc_nulls\": 0.36161088943481445, \"value_count\": 374, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 374.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.1528165340423584, \"percentile_inc_nulls\": 0.3527292013168335, \"value_count\": 350, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 350.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.1417563557624817, \"percentile_inc_nulls\": 0.3442789316177368, \"value_count\": 333, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 333.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.1311279535293579, \"percentile_inc_nulls\": 0.3361585736274719, \"value_count\": 320, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 320.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.12076526880264282, \"percentile_inc_nulls\": 0.32824116945266724, \"value_count\": 312, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 312.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.110801100730896, \"percentile_inc_nulls\": 0.32062828540802, \"value_count\": 300, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 300.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 
0.10096985101699829, \"percentile_inc_nulls\": 0.31311696767807007, \"value_count\": 296, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 296.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.09253352880477905, \"percentile_inc_nulls\": 0.3066713809967041, \"value_count\": 254, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 254.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.08442938327789307, \"percentile_inc_nulls\": 0.30047959089279175, \"value_count\": 244, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 244.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.07685667276382446, \"percentile_inc_nulls\": 0.29469382762908936, \"value_count\": 228, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 228.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.06257474422454834, \"percentile_inc_nulls\": 0.28378206491470337, \"value_count\": 215, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 430.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.05616450309753418, \"percentile_inc_nulls\": 0.2788844704627991, \"value_count\": 193, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 193.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.050019919872283936, \"percentile_inc_nulls\": 0.27418988943099976, \"value_count\": 185, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 185.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 
0.04483860731124878, \"percentile_inc_nulls\": 0.2702311873435974, \"value_count\": 156, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 156.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.0397568941116333, \"percentile_inc_nulls\": 0.26634860038757324, \"value_count\": 153, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 153.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.03484123945236206, \"percentile_inc_nulls\": 0.2625929117202759, \"value_count\": 148, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 148.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.030324161052703857, \"percentile_inc_nulls\": 0.25914180278778076, \"value_count\": 136, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 136.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.025873541831970215, \"percentile_inc_nulls\": 0.2557413578033447, \"value_count\": 134, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 134.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.02172178030014038, \"percentile_inc_nulls\": 0.252569317817688, \"value_count\": 125, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 125.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.017868995666503906, \"percentile_inc_nulls\": 0.24962568283081055, \"value_count\": 116, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 116.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 
0.014149069786071777, \"percentile_inc_nulls\": 0.2467835545539856, \"value_count\": 112, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.010728061199188232, \"percentile_inc_nulls\": 0.24416983127593994, \"value_count\": 103, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 103.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.008037745952606201, \"percentile_inc_nulls\": 0.2421143651008606, \"value_count\": 81, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 81.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.0053806304931640625, \"percentile_inc_nulls\": 0.2400842308998108, \"value_count\": 80, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 80.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.0031552910804748535, \"percentile_inc_nulls\": 0.23838406801223755, \"value_count\": 67, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 67.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.001926422119140625, \"percentile_inc_nulls\": 0.23744511604309082, \"value_count\": 37, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 37.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.0011292695999145508, \"percentile_inc_nulls\": 0.23683607578277588, \"value_count\": 24, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 24.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 
0.000597834587097168, \"percentile_inc_nulls\": 0.23643004894256592, \"value_count\": 16, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.0003985762596130371, \"percentile_inc_nulls\": 0.23627781867980957, \"value_count\": 6, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 6.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.000298917293548584, \"percentile_inc_nulls\": 0.2362017035484314, \"value_count\": 3, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 3.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 9.965896606445312e-05, \"percentile_inc_nulls\": 0.23604941368103027, \"value_count\": 2, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 6.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.2359732985496521, \"value_count\": 1, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 3.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 3231, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 3231.0, \"distinct_value_count\": 63}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": 
\"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"state\\\"\", \"subtitle\": \"In this col, 9,299 values (23.6%) are null and there are 63 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 3231, \"group_name\": \"_state_\", \"value\": \"ca\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 63}, {\"value_count\": 2605, \"group_name\": \"_state_\", \"value\": \"tx\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 63}, {\"value_count\": 1813, \"group_name\": \"_state_\", \"value\": \"ny\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 63}, {\"value_count\": 1615, \"group_name\": \"_state_\", \"value\": \"nc\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 63}, {\"value_count\": 1385, \"group_name\": \"_state_\", \"value\": \"ma\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 63}, {\"value_count\": 1142, \"group_name\": \"_state_\", \"value\": \"mn\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 63}, {\"value_count\": 1129, \"group_name\": \"_state_\", \"value\": \"fl\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 63}, {\"value_count\": 1002, \"group_name\": \"_state_\", \"value\": \"pa\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 63}, {\"value_count\": 914, \"group_name\": \"_state_\", \"value\": \"nj\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 63}, {\"value_count\": 892, \"group_name\": \"_state_\", 
\"value\": \"il\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 63}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"uk\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 63}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"8a\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 63}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"mp\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 63}, {\"value_count\": 2, \"group_name\": \"_state_\", \"value\": \"as\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 63}, {\"value_count\": 2, \"group_name\": \"_state_\", \"value\": \"gu\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 63}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 3231]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 
values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9577498435974121, \"percentile_inc_nulls\": 0.9728982448577881, \"value_count\": 1068, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 1068.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.9291083216667175, \"percentile_inc_nulls\": 0.9545258283615112, \"value_count\": 724, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 724.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.9056096076965332, \"percentile_inc_nulls\": 0.9394524097442627, \"value_count\": 594, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 594.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.8830999135971069, \"percentile_inc_nulls\": 0.9250133037567139, \"value_count\": 569, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 569.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.862963855266571, \"percentile_inc_nulls\": 0.9120968580245972, \"value_count\": 509, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 509.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.8452804684638977, \"percentile_inc_nulls\": 0.9007536768913269, \"value_count\": 447, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 447.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.8341245651245117, \"percentile_inc_nulls\": 0.8935976028442383, \"value_count\": 282, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 282.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.8247883319854736, \"percentile_inc_nulls\": 0.8876088261604309, \"value_count\": 236, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 236.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.8160851001739502, \"percentile_inc_nulls\": 0.8820260167121887, \"value_count\": 220, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 220.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.8079752922058105, \"percentile_inc_nulls\": 0.876823902130127, \"value_count\": 205, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 205.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.8000237345695496, \"percentile_inc_nulls\": 0.8717232942581177, \"value_count\": 201, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 201.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.7921908497810364, \"percentile_inc_nulls\": 0.8666988015174866, \"value_count\": 198, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 198.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.7845557332038879, \"percentile_inc_nulls\": 0.8618012070655823, \"value_count\": 193, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 193.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.7701558470726013, \"percentile_inc_nulls\": 0.85256427526474, \"value_count\": 182, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 364.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.763351559638977, \"percentile_inc_nulls\": 0.8481995463371277, \"value_count\": 172, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 172.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.7567845582962036, \"percentile_inc_nulls\": 0.8439871072769165, \"value_count\": 166, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 166.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.7502967119216919, \"percentile_inc_nulls\": 0.8398253917694092, \"value_count\": 164, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 164.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.7442044019699097, \"percentile_inc_nulls\": 0.8359174728393555, \"value_count\": 154, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 154.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.7384287118911743, \"percentile_inc_nulls\": 0.8322125673294067, \"value_count\": 146, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 146.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.7328506708145142, \"percentile_inc_nulls\": 0.82863450050354, \"value_count\": 141, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 141.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.7275892496109009, \"percentile_inc_nulls\": 0.8252594470977783, \"value_count\": 133, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 133.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.7224463820457458, \"percentile_inc_nulls\": 0.8219605684280396, \"value_count\": 130, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 130.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.7176991701126099, \"percentile_inc_nulls\": 0.8189154267311096, \"value_count\": 120, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.7130706310272217, \"percentile_inc_nulls\": 0.8159464001655579, \"value_count\": 117, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 117.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.703892707824707, \"percentile_inc_nulls\": 0.810059130191803, \"value_count\": 116, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 232.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6993433237075806, \"percentile_inc_nulls\": 0.8071408271789551, \"value_count\": 115, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 115.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6948334574699402, \"percentile_inc_nulls\": 0.8042479753494263, \"value_count\": 114, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6904423236846924, \"percentile_inc_nulls\": 0.8014312386512756, \"value_count\": 111, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 111.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6862488985061646, \"percentile_inc_nulls\": 0.798741340637207, \"value_count\": 106, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 106.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6779413223266602, \"percentile_inc_nulls\": 0.7934123277664185, \"value_count\": 105, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 210.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6738666296005249, \"percentile_inc_nulls\": 0.7907986044883728, \"value_count\": 103, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 103.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6698315143585205, \"percentile_inc_nulls\": 0.7882102131843567, \"value_count\": 102, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6658754348754883, \"percentile_inc_nulls\": 0.7856726050376892, \"value_count\": 100, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 100.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6581217050552368, \"percentile_inc_nulls\": 0.7806988954544067, \"value_count\": 98, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 196.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.654323935508728, \"percentile_inc_nulls\": 0.778262734413147, \"value_count\": 96, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 96.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6508030891418457, \"percentile_inc_nulls\": 0.7760042548179626, \"value_count\": 89, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 89.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6473218202590942, \"percentile_inc_nulls\": 0.7737711668014526, \"value_count\": 88, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6440383195877075, \"percentile_inc_nulls\": 0.7716649174690247, \"value_count\": 83, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 83.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6409130096435547, \"percentile_inc_nulls\": 0.769660234451294, \"value_count\": 79, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 79.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6378273963928223, \"percentile_inc_nulls\": 0.7676808834075928, \"value_count\": 78, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.634820818901062, \"percentile_inc_nulls\": 0.7657522559165955, \"value_count\": 76, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 76.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6318933367729187, \"percentile_inc_nulls\": 0.7638744115829468, \"value_count\": 74, \"group_name\": \"_city_\", \"total_non_null_rows\": 
25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 74.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6261175870895386, \"percentile_inc_nulls\": 0.760169506072998, \"value_count\": 73, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 146.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6205000281333923, \"percentile_inc_nulls\": 0.7565661072731018, \"value_count\": 71, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 142.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6178890466690063, \"percentile_inc_nulls\": 0.7548912763595581, \"value_count\": 66, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 66.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6128253936767578, \"percentile_inc_nulls\": 0.7516431212425232, \"value_count\": 64, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 128.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6103330850601196, \"percentile_inc_nulls\": 0.7500444054603577, \"value_count\": 63, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 63.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.607959508895874, \"percentile_inc_nulls\": 0.7485218048095703, \"value_count\": 60, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 60.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.605823278427124, \"percentile_inc_nulls\": 0.7471514940261841, \"value_count\": 54, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, 
\"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 54.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6037265658378601, \"percentile_inc_nulls\": 0.7458065748214722, \"value_count\": 53, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 53.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5997705459594727, \"percentile_inc_nulls\": 0.7432689666748047, \"value_count\": 50, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 100.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5978716611862183, \"percentile_inc_nulls\": 0.7420508861541748, \"value_count\": 48, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 48.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5922936797142029, \"percentile_inc_nulls\": 0.7384728193283081, \"value_count\": 47, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 141.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5904738903045654, \"percentile_inc_nulls\": 0.7373055219650269, \"value_count\": 46, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 46.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5835113525390625, \"percentile_inc_nulls\": 0.7328393459320068, \"value_count\": 44, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 176.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5784081220626831, \"percentile_inc_nulls\": 0.7295657992362976, \"value_count\": 43, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, 
\"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 129.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5750850439071655, \"percentile_inc_nulls\": 0.7274342179298401, \"value_count\": 42, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 84.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5734630823135376, \"percentile_inc_nulls\": 0.7263938188552856, \"value_count\": 41, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 41.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5655510425567627, \"percentile_inc_nulls\": 0.7213185429573059, \"value_count\": 40, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5609225034713745, \"percentile_inc_nulls\": 0.7183495759963989, \"value_count\": 39, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 117.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5594192743301392, \"percentile_inc_nulls\": 0.7173852324485779, \"value_count\": 38, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 38.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.552100658416748, \"percentile_inc_nulls\": 0.7126906514167786, \"value_count\": 37, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 185.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5464040040969849, \"percentile_inc_nulls\": 0.7090364694595337, \"value_count\": 36, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, 
\"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5436347723007202, \"percentile_inc_nulls\": 0.7072601318359375, \"value_count\": 35, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5395996570587158, \"percentile_inc_nulls\": 0.7046717405319214, \"value_count\": 34, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5356832146644592, \"percentile_inc_nulls\": 0.7021595239639282, \"value_count\": 33, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5293536186218262, \"percentile_inc_nulls\": 0.6980993151664734, \"value_count\": 32, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5232217311859131, \"percentile_inc_nulls\": 0.6941660046577454, \"value_count\": 31, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 155.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5196613669395447, \"percentile_inc_nulls\": 0.6918821334838867, \"value_count\": 30, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5139251947402954, \"percentile_inc_nulls\": 0.6882026195526123, \"value_count\": 29, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, 
\"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 145.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5072790384292603, \"percentile_inc_nulls\": 0.6839393973350525, \"value_count\": 28, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 168.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5051428079605103, \"percentile_inc_nulls\": 0.6825690865516663, \"value_count\": 27, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 54.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5030856728553772, \"percentile_inc_nulls\": 0.6812494993209839, \"value_count\": 26, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 52.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.49814069271087646, \"percentile_inc_nulls\": 0.6780774593353271, \"value_count\": 25, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 125.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.4943429231643677, \"percentile_inc_nulls\": 0.6756414175033569, \"value_count\": 24, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 96.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.4888836145401001, \"percentile_inc_nulls\": 0.6721394658088684, \"value_count\": 23, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.4819210171699524, \"percentile_inc_nulls\": 0.6676732301712036, \"value_count\": 22, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, 
\"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 176.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.4719519019126892, \"percentile_inc_nulls\": 0.661278486251831, \"value_count\": 21, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 252.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.46562230587005615, \"percentile_inc_nulls\": 0.6572182178497314, \"value_count\": 20, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.4626157283782959, \"percentile_inc_nulls\": 0.6552896499633789, \"value_count\": 19, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 76.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.45122241973876953, \"percentile_inc_nulls\": 0.6479812860488892, \"value_count\": 18, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 288.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.4418070912361145, \"percentile_inc_nulls\": 0.6419417858123779, \"value_count\": 17, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 238.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.43357861042022705, \"percentile_inc_nulls\": 0.636663556098938, \"value_count\": 16, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 208.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.4223039746284485, \"percentile_inc_nulls\": 0.6294313073158264, \"value_count\": 15, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, 
\"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 285.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.4117810130119324, \"percentile_inc_nulls\": 0.6226812601089478, \"value_count\": 14, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 266.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.402523934841156, \"percentile_inc_nulls\": 0.6167432069778442, \"value_count\": 13, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.3925548195838928, \"percentile_inc_nulls\": 0.6103484034538269, \"value_count\": 12, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 252.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.37775933742523193, \"percentile_inc_nulls\": 0.6008577346801758, \"value_count\": 11, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 374.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.36153966188430786, \"percentile_inc_nulls\": 0.5904535055160522, \"value_count\": 10, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 410.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.3426694869995117, \"percentile_inc_nulls\": 0.5783489942550659, \"value_count\": 9, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 477.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.3220982551574707, \"percentile_inc_nulls\": 0.5651533603668213, \"value_count\": 8, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, 
\"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 520.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.3018830418586731, \"percentile_inc_nulls\": 0.552186131477356, \"value_count\": 7, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 511.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.27696019411087036, \"percentile_inc_nulls\": 0.536199152469635, \"value_count\": 6, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 630.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.234433114528656, \"percentile_inc_nulls\": 0.5089197158813477, \"value_count\": 5, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 1075.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.18537861108779907, \"percentile_inc_nulls\": 0.47745323181152344, \"value_count\": 4, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 1240.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.13161641359329224, \"percentile_inc_nulls\": 0.4429669976234436, \"value_count\": 3, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 1359.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.07259279489517212, \"percentile_inc_nulls\": 0.40510571002960205, \"value_count\": 2, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 1492.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.3585403561592102, \"value_count\": 1, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, 
\"sum_tokens_in_value_count_group\": 1835.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 1068, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 1068.0, \"distinct_value_count\": 4225}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"city\\\"\", \"subtitle\": \"In this col, 14,129 values (35.9%) are null and there are 4225 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1068, \"group_name\": \"_city_\", \"value\": \"houston\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 4225}, {\"value_count\": 724, \"group_name\": \"_city_\", \"value\": \"new york\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 4225}, {\"value_count\": 594, \"group_name\": \"_city_\", \"value\": \"juno beach\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 4225}, {\"value_count\": 569, \"group_name\": \"_city_\", \"value\": \"san francisco\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 4225}, {\"value_count\": 509, \"group_name\": \"_city_\", \"value\": \"boston\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, 
\"distinct_value_count\": 4225}, {\"value_count\": 447, \"group_name\": \"_city_\", \"value\": \"charlotte\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 4225}, {\"value_count\": 282, \"group_name\": \"_city_\", \"value\": \"chicago\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 4225}, {\"value_count\": 236, \"group_name\": \"_city_\", \"value\": \"san diego\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 4225}, {\"value_count\": 220, \"group_name\": \"_city_\", \"value\": \"andover\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 4225}, {\"value_count\": 205, \"group_name\": \"_city_\", \"value\": \"omaha\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 4225}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"stafford springs\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 4225}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"frisco\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 4225}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"forth worth\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 4225}, {\"value_count\": 1, 
\"group_name\": \"_city_\", \"value\": \"tooele\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 4225}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"munster\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 4225}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 1068]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9842325448989868, \"percentile_inc_nulls\": 0.9920572638511658, \"value_count\": 313, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 313.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9770288467407227, \"percentile_inc_nulls\": 0.9884284734725952, \"value_count\": 143, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 143.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9700770974159241, \"percentile_inc_nulls\": 0.9849265217781067, \"value_count\": 138, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9637297987937927, \"percentile_inc_nulls\": 0.9817291498184204, \"value_count\": 126, \"group_name\": 
\"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 126.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9592967629432678, \"percentile_inc_nulls\": 0.9794960021972656, \"value_count\": 88, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9553171396255493, \"percentile_inc_nulls\": 0.9774913191795349, \"value_count\": 79, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 79.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9518412351608276, \"percentile_inc_nulls\": 0.975740373134613, \"value_count\": 69, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 69.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9484156966209412, \"percentile_inc_nulls\": 0.9740147590637207, \"value_count\": 68, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 68.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9450909495353699, \"percentile_inc_nulls\": 0.972339928150177, \"value_count\": 66, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 66.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9419676661491394, \"percentile_inc_nulls\": 0.9707666039466858, \"value_count\": 62, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 62.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9389955401420593, \"percentile_inc_nulls\": 
0.9692693948745728, \"value_count\": 59, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 59.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9361745119094849, \"percentile_inc_nulls\": 0.9678483605384827, \"value_count\": 56, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9334542155265808, \"percentile_inc_nulls\": 0.9664780497550964, \"value_count\": 54, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 54.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9307843446731567, \"percentile_inc_nulls\": 0.9651330709457397, \"value_count\": 53, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 53.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9256460666656494, \"percentile_inc_nulls\": 0.9625447392463684, \"value_count\": 51, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9232280254364014, \"percentile_inc_nulls\": 0.9613266587257385, \"value_count\": 48, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 48.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9184927940368652, \"percentile_inc_nulls\": 0.9589412808418274, \"value_count\": 47, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 94.0, \"distinct_value_count\": 10892}, 
{\"percentile_ex_nulls\": 0.9161754846572876, \"percentile_inc_nulls\": 0.9577739834785461, \"value_count\": 46, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 46.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9139086008071899, \"percentile_inc_nulls\": 0.9566320776939392, \"value_count\": 45, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 45.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.909475564956665, \"percentile_inc_nulls\": 0.9543989896774292, \"value_count\": 44, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9055463075637817, \"percentile_inc_nulls\": 0.952419638633728, \"value_count\": 39, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8998035192489624, \"percentile_inc_nulls\": 0.9495267271995544, \"value_count\": 38, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8960757851600647, \"percentile_inc_nulls\": 0.9476488828659058, \"value_count\": 37, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 74.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8925495147705078, \"percentile_inc_nulls\": 0.9458725452423096, \"value_count\": 35, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, 
\"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8891239762306213, \"percentile_inc_nulls\": 0.944146990776062, \"value_count\": 34, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 68.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8874616026878357, \"percentile_inc_nulls\": 0.9433095455169678, \"value_count\": 33, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8842375874519348, \"percentile_inc_nulls\": 0.9416854977607727, \"value_count\": 32, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 64.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8811143040657043, \"percentile_inc_nulls\": 0.9401121735572815, \"value_count\": 31, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 62.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8796030282974243, \"percentile_inc_nulls\": 0.9393509030342102, \"value_count\": 30, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 30.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.875371515750885, \"percentile_inc_nulls\": 0.9372192621231079, \"value_count\": 28, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 84.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8644904494285583, \"percentile_inc_nulls\": 0.931738018989563, \"value_count\": 27, \"group_name\": \"_street_address_\", 
\"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 216.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.860561192035675, \"percentile_inc_nulls\": 0.9297586679458618, \"value_count\": 26, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8530048727989197, \"percentile_inc_nulls\": 0.9259522557258606, \"value_count\": 25, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 150.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8469598293304443, \"percentile_inc_nulls\": 0.9229071140289307, \"value_count\": 24, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8434839248657227, \"percentile_inc_nulls\": 0.9211561679840088, \"value_count\": 23, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 69.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8390509486198425, \"percentile_inc_nulls\": 0.918923020362854, \"value_count\": 22, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8337615132331848, \"percentile_inc_nulls\": 0.9162585139274597, \"value_count\": 21, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 105.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8297314643859863, \"percentile_inc_nulls\": 0.9142284393310547, 
\"value_count\": 20, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 80.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8259029984474182, \"percentile_inc_nulls\": 0.9122998714447021, \"value_count\": 19, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 76.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8213691711425781, \"percentile_inc_nulls\": 0.9100160002708435, \"value_count\": 18, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8119490146636963, \"percentile_inc_nulls\": 0.9052706360816956, \"value_count\": 17, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 187.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8046950101852417, \"percentile_inc_nulls\": 0.9016164541244507, \"value_count\": 16, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.799405574798584, \"percentile_inc_nulls\": 0.8989519476890564, \"value_count\": 15, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 105.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.7923530340194702, \"percentile_inc_nulls\": 0.895399272441864, \"value_count\": 14, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 140.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 
0.7831847071647644, \"percentile_inc_nulls\": 0.8907808065414429, \"value_count\": 13, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 182.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.7716991901397705, \"percentile_inc_nulls\": 0.8849950432777405, \"value_count\": 12, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 228.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.7633872032165527, \"percentile_inc_nulls\": 0.8808079957962036, \"value_count\": 11, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 165.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.7528084516525269, \"percentile_inc_nulls\": 0.875478982925415, \"value_count\": 10, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 210.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.7396605014801025, \"percentile_inc_nulls\": 0.8688557744026184, \"value_count\": 9, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 261.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.7235403656959534, \"percentile_inc_nulls\": 0.8607354164123535, \"value_count\": 8, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 320.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.7062616348266602, \"percentile_inc_nulls\": 0.8520313501358032, \"value_count\": 7, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 343.0, 
\"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.6808724999427795, \"percentile_inc_nulls\": 0.8392417430877686, \"value_count\": 6, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 504.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.6536698341369629, \"percentile_inc_nulls\": 0.8255386352539062, \"value_count\": 5, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 540.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.6141756176948547, \"percentile_inc_nulls\": 0.8056436777114868, \"value_count\": 4, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 784.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.559921383857727, \"percentile_inc_nulls\": 0.7783135175704956, \"value_count\": 3, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 1077.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.4305576682090759, \"percentile_inc_nulls\": 0.7131474018096924, \"value_count\": 2, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 2568.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.4962570071220398, \"value_count\": 1, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 8547.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 313, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, 
\"sum_tokens_in_value_count_group\": 313.0, \"distinct_value_count\": 10892}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"street_address\\\"\", \"subtitle\": \"In this col, 19,556 values (49.6%) are null and there are 10892 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 313, \"group_name\": \"_street_address_\", \"value\": \"700 universe blvd\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 10892}, {\"value_count\": 143, \"group_name\": \"_street_address_\", \"value\": \"800 taylor st, suite 200\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 10892}, {\"value_count\": 138, \"group_name\": \"_street_address_\", \"value\": \"700 universe blvd.\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 10892}, {\"value_count\": 126, \"group_name\": \"_street_address_\", \"value\": \"130 roberts street\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 10892}, {\"value_count\": 88, \"group_name\": \"_street_address_\", \"value\": \"333 washington street\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 10892}, {\"value_count\": 79, \"group_name\": \"_street_address_\", \"value\": \"1519 king street\", \"total_non_null_rows\": 
19851, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 10892}, {\"value_count\": 69, \"group_name\": \"_street_address_\", \"value\": \"575 fifth ave., 35th fl.\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 10892}, {\"value_count\": 68, \"group_name\": \"_street_address_\", \"value\": \"101 summer street, 2nd floor\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 10892}, {\"value_count\": 66, \"group_name\": \"_street_address_\", \"value\": \"14302 fnb parkway\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 10892}, {\"value_count\": 62, \"group_name\": \"_street_address_\", \"value\": \"66 york street, 5th floor\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 10892}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"14700 downey avenue\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 10892}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"80 vandenburgh avenue\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 10892}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"32982 road 80\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 10892}, 
{\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"401 f street, nw\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 10892}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"9600 sw barnes rd\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 10892}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 313]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9763555526733398, \"percentile_inc_nulls\": 0.9850280284881592, \"value_count\": 590, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 590.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.9581212401390076, \"percentile_inc_nulls\": 0.9734818935394287, \"value_count\": 455, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 455.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.9498657584190369, \"percentile_inc_nulls\": 0.9682543873786926, \"value_count\": 206, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 206.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.9417705535888672, \"percentile_inc_nulls\": 0.963128387928009, \"value_count\": 
202, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 202.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.9337554574012756, \"percentile_inc_nulls\": 0.9580531120300293, \"value_count\": 200, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.9258606433868408, \"percentile_inc_nulls\": 0.9530540108680725, \"value_count\": 197, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 197.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.9186069965362549, \"percentile_inc_nulls\": 0.9484609365463257, \"value_count\": 181, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 181.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.9114735722541809, \"percentile_inc_nulls\": 0.943943977355957, \"value_count\": 178, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 178.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.9044603705406189, \"percentile_inc_nulls\": 0.9395031332969666, \"value_count\": 175, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 175.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8975273370742798, \"percentile_inc_nulls\": 0.9351130723953247, \"value_count\": 173, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 173.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8906744718551636, \"percentile_inc_nulls\": 0.9307737350463867, 
\"value_count\": 171, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 171.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8843024969100952, \"percentile_inc_nulls\": 0.9267389178276062, \"value_count\": 159, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 159.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.878010630607605, \"percentile_inc_nulls\": 0.9227548241615295, \"value_count\": 157, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 157.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8724802732467651, \"percentile_inc_nulls\": 0.9192529320716858, \"value_count\": 138, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8674708604812622, \"percentile_inc_nulls\": 0.916080892086029, \"value_count\": 125, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 125.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8627419471740723, \"percentile_inc_nulls\": 0.9130865335464478, \"value_count\": 118, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 118.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8580932021141052, \"percentile_inc_nulls\": 0.9101428985595703, \"value_count\": 116, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 116.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8498376607894897, \"percentile_inc_nulls\": 
0.9049153923988342, \"value_count\": 103, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 206.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8458702564239502, \"percentile_inc_nulls\": 0.9024031162261963, \"value_count\": 99, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8419428467750549, \"percentile_inc_nulls\": 0.8999162912368774, \"value_count\": 98, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8380956053733826, \"percentile_inc_nulls\": 0.8974801301956177, \"value_count\": 96, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 96.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8343285322189331, \"percentile_inc_nulls\": 0.8950947523117065, \"value_count\": 94, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 94.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8306015133857727, \"percentile_inc_nulls\": 0.8927347660064697, \"value_count\": 93, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 93.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.826994776725769, \"percentile_inc_nulls\": 0.8904509544372559, \"value_count\": 90, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8234280347824097, 
\"percentile_inc_nulls\": 0.8881924748420715, \"value_count\": 89, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 89.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8201418519020081, \"percentile_inc_nulls\": 0.8861116170883179, \"value_count\": 82, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 82.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8169759511947632, \"percentile_inc_nulls\": 0.8841068744659424, \"value_count\": 79, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 79.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8108043074607849, \"percentile_inc_nulls\": 0.8801989555358887, \"value_count\": 77, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 154.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8078387379646301, \"percentile_inc_nulls\": 0.87832111120224, \"value_count\": 74, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 74.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8049933910369873, \"percentile_inc_nulls\": 0.8765193819999695, \"value_count\": 71, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 71.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8021880984306335, \"percentile_inc_nulls\": 0.8747430443763733, \"value_count\": 70, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 
0.7971386313438416, \"percentile_inc_nulls\": 0.871545672416687, \"value_count\": 63, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 126.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7946539521217346, \"percentile_inc_nulls\": 0.8699723482131958, \"value_count\": 62, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 62.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7900052070617676, \"percentile_inc_nulls\": 0.8670287132263184, \"value_count\": 58, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 116.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7855969071388245, \"percentile_inc_nulls\": 0.8642373085021973, \"value_count\": 55, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7792249321937561, \"percentile_inc_nulls\": 0.8602024912834167, \"value_count\": 53, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 159.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7729731798171997, \"percentile_inc_nulls\": 0.8562438488006592, \"value_count\": 52, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 156.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7709293365478516, \"percentile_inc_nulls\": 0.8549495935440063, \"value_count\": 51, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 51.0, \"distinct_value_count\": 6401}, 
{\"percentile_ex_nulls\": 0.7669218182563782, \"percentile_inc_nulls\": 0.8524119853973389, \"value_count\": 50, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 100.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7629944086074829, \"percentile_inc_nulls\": 0.84992516040802, \"value_count\": 49, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7591471672058105, \"percentile_inc_nulls\": 0.8474889993667603, \"value_count\": 48, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 96.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7553800940513611, \"percentile_inc_nulls\": 0.8451036810874939, \"value_count\": 47, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 94.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7535366415977478, \"percentile_inc_nulls\": 0.8439363241195679, \"value_count\": 46, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 46.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7481265068054199, \"percentile_inc_nulls\": 0.8405105471611023, \"value_count\": 45, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 135.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7445998191833496, \"percentile_inc_nulls\": 0.8382774591445923, \"value_count\": 44, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 
6401}, {\"percentile_ex_nulls\": 0.741153359413147, \"percentile_inc_nulls\": 0.8360950946807861, \"value_count\": 43, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 86.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7394702434539795, \"percentile_inc_nulls\": 0.8350293040275574, \"value_count\": 42, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7361840009689331, \"percentile_inc_nulls\": 0.8329484462738037, \"value_count\": 41, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 82.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.731374979019165, \"percentile_inc_nulls\": 0.8299033045768738, \"value_count\": 40, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7282490730285645, \"percentile_inc_nulls\": 0.8279239535331726, \"value_count\": 39, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7236804962158203, \"percentile_inc_nulls\": 0.8250311017036438, \"value_count\": 38, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7133010029792786, \"percentile_inc_nulls\": 0.8184586763381958, \"value_count\": 37, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 259.0, 
\"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.70753014087677, \"percentile_inc_nulls\": 0.8148044943809509, \"value_count\": 36, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6963090896606445, \"percentile_inc_nulls\": 0.8076991438865662, \"value_count\": 35, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 280.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6935839653015137, \"percentile_inc_nulls\": 0.8059735298156738, \"value_count\": 34, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 68.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6896164417266846, \"percentile_inc_nulls\": 0.8034613132476807, \"value_count\": 33, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6832044124603271, \"percentile_inc_nulls\": 0.7994011044502258, \"value_count\": 32, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6807197332382202, \"percentile_inc_nulls\": 0.7978277802467346, \"value_count\": 31, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 62.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6771129369735718, \"percentile_inc_nulls\": 0.795543909072876, \"value_count\": 30, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 
90.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6724642515182495, \"percentile_inc_nulls\": 0.7926002740859985, \"value_count\": 29, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 116.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.669097900390625, \"percentile_inc_nulls\": 0.790468692779541, \"value_count\": 28, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 84.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6647697687149048, \"percentile_inc_nulls\": 0.7877280712127686, \"value_count\": 27, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6574760675430298, \"percentile_inc_nulls\": 0.7831096053123474, \"value_count\": 26, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 182.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6504628658294678, \"percentile_inc_nulls\": 0.7786687612533569, \"value_count\": 25, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 175.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6418066024780273, \"percentile_inc_nulls\": 0.773187518119812, \"value_count\": 24, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 216.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6353544592857361, \"percentile_inc_nulls\": 0.7691019177436829, \"value_count\": 23, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, 
\"sum_tokens_in_value_count_group\": 161.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6318278312683105, \"percentile_inc_nulls\": 0.7668688297271729, \"value_count\": 22, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6242536306381226, \"percentile_inc_nulls\": 0.762072741985321, \"value_count\": 21, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 189.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6194445490837097, \"percentile_inc_nulls\": 0.7590276002883911, \"value_count\": 20, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.61411452293396, \"percentile_inc_nulls\": 0.7556525468826294, \"value_count\": 19, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 133.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6112291216850281, \"percentile_inc_nulls\": 0.7538254261016846, \"value_count\": 18, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 72.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6030537486076355, \"percentile_inc_nulls\": 0.7486487030982971, \"value_count\": 17, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 204.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.5966416597366333, \"percentile_inc_nulls\": 0.7445884943008423, \"value_count\": 16, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 
39407, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.587624728679657, \"percentile_inc_nulls\": 0.7388788461685181, \"value_count\": 15, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 225.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.5758426189422607, \"percentile_inc_nulls\": 0.731418251991272, \"value_count\": 14, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 294.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.5669859647750854, \"percentile_inc_nulls\": 0.725810170173645, \"value_count\": 13, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 221.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.5568869113922119, \"percentile_inc_nulls\": 0.7194153070449829, \"value_count\": 12, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 252.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.5414579510688782, \"percentile_inc_nulls\": 0.7096455097198486, \"value_count\": 11, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 385.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.5262293219566345, \"percentile_inc_nulls\": 0.7000025510787964, \"value_count\": 10, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 380.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.5081954002380371, \"percentile_inc_nulls\": 0.688583254814148, \"value_count\": 9, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, 
\"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 450.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.48639440536499023, \"percentile_inc_nulls\": 0.6747785806655884, \"value_count\": 8, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 544.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.4659159183502197, \"percentile_inc_nulls\": 0.661811351776123, \"value_count\": 7, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 511.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.4298481345176697, \"percentile_inc_nulls\": 0.6389727592468262, \"value_count\": 6, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 900.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.3877689838409424, \"percentile_inc_nulls\": 0.6123277544975281, \"value_count\": 5, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 1050.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.3178775906562805, \"percentile_inc_nulls\": 0.5680716633796692, \"value_count\": 4, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 1744.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.2271069884300232, \"percentile_inc_nulls\": 0.5105946063995361, \"value_count\": 3, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 2265.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.11842262744903564, \"percentile_inc_nulls\": 0.44177430868148804, \"value_count\": 2, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 
24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 2712.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.3667876124382019, \"value_count\": 1, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 2955.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 590, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 590.0, \"distinct_value_count\": 6401}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"zip_code\\\"\", \"subtitle\": \"In this col, 14,454 values (36.7%) are null and there are 6401 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 590, \"group_name\": \"_zip_code_\", \"value\": \"33408\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 6401}, {\"value_count\": 455, \"group_name\": \"_zip_code_\", \"value\": \"77002\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 6401}, {\"value_count\": 206, \"group_name\": \"_zip_code_\", \"value\": \"01810\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 6401}, 
{\"value_count\": 202, \"group_name\": \"_zip_code_\", \"value\": \"28801\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 6401}, {\"value_count\": 200, \"group_name\": \"_zip_code_\", \"value\": \"10017\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 6401}, {\"value_count\": 197, \"group_name\": \"_zip_code_\", \"value\": \"27517\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 6401}, {\"value_count\": 181, \"group_name\": \"_zip_code_\", \"value\": \"77056\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 6401}, {\"value_count\": 178, \"group_name\": \"_zip_code_\", \"value\": \"07302\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 6401}, {\"value_count\": 175, \"group_name\": \"_zip_code_\", \"value\": \"02110\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 6401}, {\"value_count\": 173, \"group_name\": \"_zip_code_\", \"value\": \"37201\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 6401}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"06076\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 6401}, {\"value_count\": 1, \"group_name\": \"_zip_code_\", 
\"value\": \"10029\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 6401}, {\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"93206\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 6401}, {\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"08536\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 6401}, {\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"86301\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 6401}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 590]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}], \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\"}, {\"mode\": \"vega-lite\"});\n",
        "</script>"
       ],
       "text/plain": [
        "alt.VConcatChart(...)"
       ]
      },
-     "execution_count": 211,
+     "execution_count": 132,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1916,34 +1922,38 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 300,
+   "execution_count": 66,
    "id": "6402e556-b87c-47ca-bc30-ced2b42e6626",
    "metadata": {},
    "outputs": [],
    "source": [
-    "br0 = \"l.report_year = r.report_year and substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3)\"\n",
-    "br1 = \"l.report_year = r.report_year and l.street_address = r.street_address\"\n",
-    "# br2 = \"l.report_year = r.report_year and substr(l.company_name_mphone,1,2) = substr(r.company_name_mphone,1,2) and l.city = r.city\"\n",
-    "br4 = \"l.report_year = r.report_year and l.phone_number = r.phone_number\""
+    "# probably shouldn't be blocking on report year, because we don't care that much \n",
+    "# about report year lining up\n",
+    "# try overlap between tokens in address or company name\n",
+    "br0 = \"substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4)\"\n",
+    "br1 = \"l.street_address = r.street_address\"\n",
+    "br2 = \"substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3) and l.city = r.city\"\n",
+    "# br3 = \"substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3) and l.zip_code = r.zip_code\"\n",
+    "br3 = \"substr(l.company_name_mphone,1,2) = substr(r.company_name_mphone,1,2) and array_length(list_intersect(l.street_address_list, r.street_address_list)) >= 2\""
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 257,
+   "execution_count": 67,
    "id": "1e6d3d24-f8ce-4118-9d31-d4f237d28237",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "{'number_of_comparisons_generated_pre_filter_conditions': 618634,\n",
-       " 'number_of_comparisons_to_be_scored_post_filter_conditions': 618634,\n",
+       "{'number_of_comparisons_generated_pre_filter_conditions': 988101,\n",
+       " 'number_of_comparisons_to_be_scored_post_filter_conditions': 988101,\n",
        " 'filter_conditions_identified': '',\n",
-       " 'equi_join_conditions_identified': 'l.report_year = r.report_year AND SUBSTRING(l.company_name_mphone, 1, 4) = SUBSTRING(r.company_name_mphone, 1, 4)',\n",
+       " 'equi_join_conditions_identified': 'SUBSTRING(l.company_name_mphone, 1, 4) = SUBSTRING(r.company_name_mphone, 1, 4)',\n",
        " 'link_type_join_condition': 'where l.\"source_dataset\" || \\'-__-\\' || l.\"record_id\" < r.\"source_dataset\" || \\'-__-\\' || r.\"record_id\" and l.\"source_dataset\" != r.\"source_dataset\"'}"
       ]
      },
-     "execution_count": 257,
+     "execution_count": 67,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1962,7 +1972,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 259,
+   "execution_count": 68,
    "id": "c37c117a-0fa2-4b8a-9fae-da3569895ca3",
    "metadata": {},
    "outputs": [
@@ -1988,8 +1998,6 @@
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>key_0</th>\n",
-       "      <th>key_1</th>\n",
-       "      <th>key_2</th>\n",
        "      <th>count_l</th>\n",
        "      <th>count_r</th>\n",
        "      <th>block_count</th>\n",
@@ -1998,43 +2006,37 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>2023</td>\n",
-       "      <td>boston</td>\n",
-       "      <td>02110</td>\n",
-       "      <td>113</td>\n",
-       "      <td>134</td>\n",
-       "      <td>15142</td>\n",
+       "      <td>AMRK</td>\n",
+       "      <td>888</td>\n",
+       "      <td>85</td>\n",
+       "      <td>75480</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>2022</td>\n",
-       "      <td>boston</td>\n",
-       "      <td>02110</td>\n",
-       "      <td>116</td>\n",
-       "      <td>110</td>\n",
-       "      <td>12760</td>\n",
+       "      <td>INTR</td>\n",
+       "      <td>468</td>\n",
+       "      <td>157</td>\n",
+       "      <td>73476</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>2021</td>\n",
-       "      <td>boston</td>\n",
-       "      <td>02110</td>\n",
-       "      <td>113</td>\n",
-       "      <td>88</td>\n",
-       "      <td>9944</td>\n",
+       "      <td>FRST</td>\n",
+       "      <td>836</td>\n",
+       "      <td>82</td>\n",
+       "      <td>68552</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "   key_0   key_1  key_2  count_l  count_r  block_count\n",
-       "0   2023  boston  02110      113      134        15142\n",
-       "1   2022  boston  02110      116      110        12760\n",
-       "2   2021  boston  02110      113       88         9944"
+       "  key_0  count_l  count_r  block_count\n",
+       "0  AMRK      888       85        75480\n",
+       "1  INTR      468      157        73476\n",
+       "2  FRST      836       82        68552"
       ]
      },
-     "execution_count": 259,
+     "execution_count": 68,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2042,7 +2044,7 @@
    "source": [
     "result = n_largest_blocks(\n",
     "    table_or_tables=[sec_match_df, eia_match_df],\n",
-    "    blocking_rule=br3,\n",
+    "    blocking_rule=br0,\n",
     "    link_type=\"link_only\",\n",
     "    db_api=db_api,\n",
     "    n_largest=3\n",
@@ -2053,32 +2055,46 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 302,
+   "execution_count": 69,
    "id": "4e1a9844-5d98-4cac-a083-eef134f083ce",
    "metadata": {},
    "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "bf1ed000055946dcbdc2d64e635de891",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
     {
      "data": {
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-4929eafdbb4f44fb9220f865b54fe3cc.vega-embed {\n",
+       "  #altair-viz-7144afd26472470d8fe5764a8949ebb8.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-4929eafdbb4f44fb9220f865b54fe3cc.vega-embed details,\n",
-       "  #altair-viz-4929eafdbb4f44fb9220f865b54fe3cc.vega-embed details summary {\n",
+       "  #altair-viz-7144afd26472470d8fe5764a8949ebb8.vega-embed details,\n",
+       "  #altair-viz-7144afd26472470d8fe5764a8949ebb8.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-4929eafdbb4f44fb9220f865b54fe3cc\"></div>\n",
+       "<div id=\"altair-viz-7144afd26472470d8fe5764a8949ebb8\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-4929eafdbb4f44fb9220f865b54fe3cc\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-4929eafdbb4f44fb9220f865b54fe3cc\");\n",
+       "    if (outputDiv.id !== \"altair-viz-7144afd26472470d8fe5764a8949ebb8\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-7144afd26472470d8fe5764a8949ebb8\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -2124,21 +2140,21 @@
        "        .catch(showError)\n",
        "        .then(() => displayChart(vegaEmbed));\n",
        "    }\n",
-       "  })({\"config\": {\"view\": {\"continuousWidth\": 300, \"continuousHeight\": 300}}, \"data\": {\"name\": \"data-bc6bb82997e900308036d5ce309e7401\"}, \"mark\": \"bar\", \"encoding\": {\"order\": {\"field\": \"cumulative_rows\"}, \"tooltip\": [{\"field\": \"blocking_rule\", \"title\": \"SQL Condition\", \"type\": \"nominal\"}, {\"field\": \"row_count\", \"format\": \",\", \"title\": \"Comparisons Generated\", \"type\": \"quantitative\"}, {\"field\": \"cumulative_rows\", \"format\": \",\", \"title\": \"Cumulative Comparisons\", \"type\": \"quantitative\"}, {\"field\": \"cartesian\", \"format\": \",\", \"title\": \"Total comparisons in Cartesian product\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"start\", \"title\": \"Comparisons Generated by Rule(s)\", \"type\": \"quantitative\"}, \"x2\": {\"field\": \"cumulative_rows\"}, \"y\": {\"field\": \"blocking_rule\", \"sort\": [\"-x2\"], \"title\": \"SQL Blocking Rule\"}}, \"height\": {\"step\": 20}, \"title\": {\"text\": \"Count of Additional Comparisons Generated by Each Blocking Rule\", \"subtitle\": \"(Counts exclude comparisons already generated by previous rules)\"}, \"width\": 450, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-bc6bb82997e900308036d5ce309e7401\": [{\"blocking_rule\": \"l.report_year = r.report_year and substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3)\", \"row_count\": 2371618, \"cumulative_rows\": 2371618, \"cartesian\": 40620617120, \"match_key\": \"0\", \"start\": 0}, {\"blocking_rule\": \"l.report_year = r.report_year and l.street_address = r.street_address\", \"row_count\": 7101, \"cumulative_rows\": 2378719, \"cartesian\": 40620617120, \"match_key\": \"1\", \"start\": 2371618}]}}, {\"mode\": \"vega-lite\"});\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 300, \"continuousHeight\": 300}}, \"data\": {\"name\": \"data-8fc653ccc17479a7e2943968c5585e30\"}, \"mark\": \"bar\", \"encoding\": {\"order\": {\"field\": \"cumulative_rows\"}, \"tooltip\": [{\"field\": \"blocking_rule\", \"title\": \"SQL Condition\", \"type\": \"nominal\"}, {\"field\": \"row_count\", \"format\": \",\", \"title\": \"Comparisons Generated\", \"type\": \"quantitative\"}, {\"field\": \"cumulative_rows\", \"format\": \",\", \"title\": \"Cumulative Comparisons\", \"type\": \"quantitative\"}, {\"field\": \"cartesian\", \"format\": \",\", \"title\": \"Total comparisons in Cartesian product\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"start\", \"title\": \"Comparisons Generated by Rule(s)\", \"type\": \"quantitative\"}, \"x2\": {\"field\": \"cumulative_rows\"}, \"y\": {\"field\": \"blocking_rule\", \"sort\": [\"-x2\"], \"title\": \"SQL Blocking Rule\"}}, \"height\": {\"step\": 20}, \"title\": {\"text\": \"Count of Additional Comparisons Generated by Each Blocking Rule\", \"subtitle\": \"(Counts exclude comparisons already generated by previous rules)\"}, \"width\": 450, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-8fc653ccc17479a7e2943968c5585e30\": [{\"blocking_rule\": \"substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4)\", \"row_count\": 988101, \"cumulative_rows\": 988101, \"cartesian\": 2542342605, \"match_key\": \"0\", \"start\": 0}, {\"blocking_rule\": \"l.street_address = r.street_address\", \"row_count\": 9184, \"cumulative_rows\": 997285, \"cartesian\": 2542342605, \"match_key\": \"1\", \"start\": 988101}, {\"blocking_rule\": \"substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3) and l.city = r.city\", \"row_count\": 13507, \"cumulative_rows\": 1010792, \"cartesian\": 2542342605, \"match_key\": \"2\", \"start\": 997285}, {\"blocking_rule\": \"substr(l.company_name_mphone,1,2) = 
substr(r.company_name_mphone,1,2) and array_length(list_intersect(l.street_address_list, r.street_address_list)) >= 2\", \"row_count\": 27665, \"cumulative_rows\": 1038457, \"cartesian\": 2542342605, \"match_key\": \"3\", \"start\": 1010792}]}}, {\"mode\": \"vega-lite\"});\n",
        "</script>"
       ],
       "text/plain": [
        "alt.Chart(...)"
       ]
      },
-     "execution_count": 302,
+     "execution_count": 69,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "blocking_rules_for_analysis = [\n",
-    "    br0, br1\n",
+    "    br0, br1, br2, br3\n",
     "]\n",
     "\n",
     "\n",
@@ -2161,7 +2177,30 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 382,
+   "execution_count": 334,
+   "id": "cb8b02b2-50a1-4525-9516-eecdf9a145db",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# NOT USED\n",
+    "company_name_comparison = cl.CustomComparison(\n",
+    "    comparison_levels = [\n",
+    "        cll.NullLevel(\"company_name\"),\n",
+    "        cll.ExactMatchLevel(\"company_name\"),\n",
+    "        # cll.ExactMatchLevel(\"company_name_no_legal\"),\n",
+    "        # cll.LevenshteinLevel(\"company_name\", distance_threshold=1),\n",
+    "        cll.JaroWinklerLevel(\"company_name_no_legal\", distance_threshold=.95),\n",
+    "        # cll.ArraySubsetLevel(\"company_name_mphone_list\"),\n",
+    "        cll.ArrayIntersectLevel(\"company_name_mphone_list\", min_intersection=3)\n",
+    "    ],\n",
+    "    output_column_name=\"company_name\",\n",
+    "    comparison_description=None\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 386,
    "id": "f72f546c-c338-4c9f-aab1-ba03c3169e18",
    "metadata": {},
    "outputs": [
@@ -2169,32 +2208,33 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Comparison 'JaccardAtThresholds' of \"company_name\".\n",
+      "Comparison 'NameComparison' of \"company_name_no_legal\".\n",
       "Similarity is assessed using the following ComparisonLevels:\n",
-      "    - 'company_name is NULL' with SQL rule: \"company_name_l\" IS NULL OR \"company_name_r\" IS NULL\n",
-      "    - 'Exact match on company_name' with SQL rule: \"company_name_l\" = \"company_name_r\"\n",
-      "    - 'Jaccard distance of 'company_name >= 0.9'' with SQL rule: jaccard(\"company_name_l\", \"company_name_r\") >= 0.9\n",
-      "    - 'Jaccard distance of 'company_name >= 0.7'' with SQL rule: jaccard(\"company_name_l\", \"company_name_r\") >= 0.7\n",
+      "    - 'company_name_no_legal is NULL' with SQL rule: \"company_name_no_legal_l\" IS NULL OR \"company_name_no_legal_r\" IS NULL\n",
+      "    - 'Exact match on company_name_no_legal' with SQL rule: \"company_name_no_legal_l\" = \"company_name_no_legal_r\"\n",
+      "    - 'Jaro-Winkler distance of company_name_no_legal >= 0.95' with SQL rule: jaro_winkler_similarity(\"company_name_no_legal_l\", \"company_name_no_legal_r\") >= 0.95\n",
       "    - 'All other comparisons' with SQL rule: ELSE\n",
       "\n"
      ]
     }
    ],
    "source": [
-    "# company_name_comparison = cl.NameComparison(\n",
-    "#     \"company_name\",\n",
-    "    # dmeta_col_name=\"company_name_mphone_list\" # this was breaking it for some reason\n",
-    "# )\n",
+    "company_name_comparison = cl.NameComparison(\n",
+    "    \"company_name_no_legal\",\n",
+    "    jaro_winkler_thresholds=[.95],\n",
+    ")\n",
+    "\"\"\"\n",
     "company_name_comparison = cl.JaccardAtThresholds(\n",
     "     \"company_name\",\n",
     "    # dmeta_col_name=\"company_name_mphone_list\" # this was breaking it for some reason\n",
     ")\n",
+    "\"\"\"\n",
     "print(company_name_comparison.get_comparison(\"duckdb\").human_readable_description)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 373,
+   "execution_count": 449,
    "id": "4298a288-c306-4d75-9d72-e5b8f87774ce",
    "metadata": {},
    "outputs": [
@@ -2207,7 +2247,6 @@
       "    - 'street_address is NULL' with SQL rule: \"street_address_l\" IS NULL OR \"street_address_r\" IS NULL\n",
       "    - 'Exact match on street_address' with SQL rule: \"street_address_l\" = \"street_address_r\"\n",
       "    - 'Levenshtein distance of street_address <= 1' with SQL rule: levenshtein(\"street_address_l\", \"street_address_r\") <= 1\n",
-      "    - 'Levenshtein distance of street_address <= 2' with SQL rule: levenshtein(\"street_address_l\", \"street_address_r\") <= 2\n",
       "    - 'All other comparisons' with SQL rule: ELSE\n",
       "\n"
      ]
@@ -2216,24 +2255,45 @@
    "source": [
     "address_comparison = cl.LevenshteinAtThresholds(\n",
     "    \"street_address\",\n",
-    "    # size_threshold_or_thresholds=[1,2,3]\n",
-    ")\n",
+    "    distance_threshold_or_thresholds=[1]\n",
+    ").configure(term_frequency_adjustments=True)\n",
     "print(address_comparison.get_comparison(\"duckdb\").human_readable_description)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 267,
+   "execution_count": 422,
+   "id": "d2e043ed-7f64-4547-992d-7f947a63db6d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# NOT USED\n",
+    "address_comparison = cl.CustomComparison(\n",
+    "    comparison_levels = [\n",
+    "        cll.NullLevel(\"street_address\"),\n",
+    "        cll.ExactMatchLevel(\"street_address\"),\n",
+    "        cll.LevenshteinLevel(\"street_address\", distance_threshold=1),\n",
+    "        cll.ArraySubsetLevel(\"street_address_list\"),\n",
+    "    ],\n",
+    "    output_column_name=\"street_address\",\n",
+    "    comparison_description=None\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 388,
    "id": "63ed7cd2-d803-4d17-b730-c9fc17df0607",
    "metadata": {},
    "outputs": [],
    "source": [
+    "# Use state and city instead of zip code\n",
     "zip_code_comparison = cl.ExactMatch(\"zip_code\").configure(term_frequency_adjustments=True)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 268,
+   "execution_count": 450,
    "id": "974a3982-38a1-45cb-9875-b8d4584c808d",
    "metadata": {},
    "outputs": [],
@@ -2243,7 +2303,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 269,
+   "execution_count": 451,
    "id": "7592619b-340a-4496-8195-9ce932cae699",
    "metadata": {},
    "outputs": [
@@ -2265,14 +2325,13 @@
     "city_comparison = cl.NameComparison(\n",
     "    \"city\",\n",
     "    jaro_winkler_thresholds=[0.9]\n",
-    "    # dmeta_col_name=\"company_name_mphone\" # this was breaking it for some reason\n",
     ")\n",
     "print(city_comparison.get_comparison(\"duckdb\").human_readable_description)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 383,
+   "execution_count": 452,
    "id": "a946c820-9b93-41f1-9fc2-6da47d0cf407",
    "metadata": {},
    "outputs": [],
@@ -2283,12 +2342,12 @@
     "    comparisons=[\n",
     "        company_name_comparison,\n",
     "        address_comparison,\n",
-    "        zip_code_comparison,\n",
+    "        # zip_code_comparison,\n",
     "        state_comparison,\n",
     "        city_comparison\n",
     "    ],\n",
     "    blocking_rules_to_generate_predictions=[\n",
-    "        br0, br1\n",
+    "        br0, br1, br2, br3\n",
     "    ],\n",
     "    retain_intermediate_calculation_columns=True,\n",
     ")\n",
@@ -2298,48 +2357,34 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 384,
+   "execution_count": 453,
    "id": "36cae876-783d-4bff-89df-9d30cc5e60d6",
    "metadata": {},
    "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "358d0a088e2441deaef798c55ad97068",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Probability two random records match is estimated to be  2.18e-05.\n",
-      "This means that amongst all possible pairwise record comparisons, one in 45,828.17 are expected to match.  With 40,620,617,120 total possible comparisons, we expect a total of around 886,367.78 matching pairs\n"
+      "Probability two random records match is estimated to be  1.78e-06.\n",
+      "This means that amongst all possible pairwise record comparisons, one in 562,858.42 are expected to match.  With 2,542,342,605 total possible comparisons, we expect a total of around 4,516.84 matching pairs\n"
      ]
     }
    ],
    "source": [
     "deterministic_rules = [\n",
-    "    block_on(\"company_name\", \"company_name\"),\n",
-    "    block_on(\"phone_number\"),\n",
-    "    block_on(\"street_address\"),\n",
-    "    \"jaccard(r.company_name, l.company_name) >= .9 and l.city = r.city\",\n",
-    "    \"substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4) and l.city = r.city\",\n",
+    "    block_on(\"company_name_mphone\", \"company_name_mphone\"),\n",
+    "    # block_on(\"street_address\"),\n",
+    "    \"jaro_winkler_similarity(r.company_name, l.company_name) >= .95 and l.city = r.city\",\n",
+    "    # \"substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4) and l.city = r.city and jaccard(r.street_address, l.street_address) >= .9\",\n",
+    "    \"substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4) and l.city = r.city and l.street_address = r.street_address\",\n",
     "]\n",
     "\n",
-    "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.9)"
+    "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.95)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 385,
+   "execution_count": null,
    "id": "a5190bda-070d-4d8e-a6db-be7c65b04db3",
    "metadata": {},
    "outputs": [
@@ -2347,26 +2392,17 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "----- Estimating u probabilities using random sampling -----\n",
-      "\n",
-      "Estimated u probabilities using random sampling\n",
-      "\n",
-      "Your model is not yet fully trained. Missing estimates for:\n",
-      "    - company_name (no m values are trained).\n",
-      "    - street_address (no m values are trained).\n",
-      "    - zip_code (no m values are trained).\n",
-      "    - state (no m values are trained).\n",
-      "    - city (no m values are trained).\n"
+      "----- Estimating u probabilities using random sampling -----\n"
      ]
     }
    ],
    "source": [
-    "linker.training.estimate_u_using_random_sampling(max_pairs=1e7)"
+    "linker.training.estimate_u_using_random_sampling(max_pairs=1e8)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 386,
+   "execution_count": 427,
    "id": "f8dd1336-8a5b-49d2-a054-df0c0a0e953f",
    "metadata": {},
    "outputs": [
@@ -2381,31 +2417,30 @@
       "(l.\"company_name\" = r.\"company_name\") AND (l.\"company_name\" = r.\"company_name\")\n",
       "\n",
       "Parameter estimates will be made for the following comparison(s):\n",
+      "    - company_name_no_legal\n",
       "    - street_address\n",
-      "    - zip_code\n",
       "    - state\n",
       "    - city\n",
       "\n",
       "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
-      "    - company_name\n",
       "\n",
-      "Iteration 1: Largest change in params was 0.804 in the m_probability of street_address, level `All other comparisons`\n",
-      "Iteration 2: Largest change in params was 0.0737 in the m_probability of state, level `Exact match on state`\n",
-      "Iteration 3: Largest change in params was -0.039 in the m_probability of state, level `All other comparisons`\n",
-      "Iteration 4: Largest change in params was 0.021 in the m_probability of city, level `All other comparisons`\n",
-      "Iteration 5: Largest change in params was 0.00805 in the m_probability of city, level `All other comparisons`\n",
-      "Iteration 6: Largest change in params was -0.00338 in the m_probability of state, level `All other comparisons`\n",
-      "Iteration 7: Largest change in params was 0.00164 in the m_probability of state, level `Exact match on state`\n",
-      "Iteration 8: Largest change in params was 0.000825 in the m_probability of state, level `Exact match on state`\n",
-      "Iteration 9: Largest change in params was -0.000425 in the m_probability of state, level `All other comparisons`\n",
-      "Iteration 10: Largest change in params was -0.000223 in the m_probability of state, level `All other comparisons`\n",
-      "Iteration 11: Largest change in params was 0.000118 in the m_probability of state, level `Exact match on state`\n",
-      "Iteration 12: Largest change in params was 6.29e-05 in the m_probability of state, level `Exact match on state`\n",
+      "WARNING:\n",
+      "Level Jaro-Winkler distance of company_name_no_legal >= 0.95 on comparison company_name_no_legal not observed in dataset, unable to train m value\n",
+      "\n",
+      "WARNING:\n",
+      "Level All other comparisons on comparison company_name_no_legal not observed in dataset, unable to train m value\n",
       "\n",
-      "EM converged after 12 iterations\n",
+      "Iteration 1: Largest change in params was -0.347 in the m_probability of city, level `All other comparisons`\n",
+      "Iteration 2: Largest change in params was 0.307 in the m_probability of city, level `All other comparisons`\n",
+      "Iteration 3: Largest change in params was 0.0403 in the m_probability of city, level `All other comparisons`\n",
+      "Iteration 4: Largest change in params was 4.46e-05 in the m_probability of city, level `All other comparisons`\n",
+      "\n",
+      "EM converged after 4 iterations\n",
+      "m probability not trained for company_name_no_legal - Jaro-Winkler distance of company_name_no_legal >= 0.95 (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n",
+      "m probability not trained for company_name_no_legal - All other comparisons (comparison vector value: 0). This usually means the comparison level was never observed in the training data.\n",
       "\n",
       "Your model is not yet fully trained. Missing estimates for:\n",
-      "    - company_name (no m values are trained).\n"
+      "    - company_name_no_legal (some m values are not trained).\n"
      ]
     }
    ],
@@ -2418,7 +2453,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 387,
+   "execution_count": 428,
    "id": "9581aa18-3352-429a-86c4-6078bcf13a55",
    "metadata": {},
    "outputs": [
@@ -2433,32 +2468,28 @@
       "(l.\"street_address\" = r.\"street_address\") AND (l.\"street_address\" = r.\"street_address\")\n",
       "\n",
       "Parameter estimates will be made for the following comparison(s):\n",
-      "    - company_name\n",
-      "    - zip_code\n",
+      "    - company_name_no_legal\n",
       "    - state\n",
       "    - city\n",
       "\n",
       "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
       "    - street_address\n",
       "\n",
-      "Iteration 1: Largest change in params was -0.929 in the m_probability of company_name, level `Exact match on company_name`\n",
-      "Iteration 2: Largest change in params was 0.0355 in probability_two_random_records_match\n",
-      "Iteration 3: Largest change in params was 0.00843 in the m_probability of state, level `All other comparisons`\n",
-      "Iteration 4: Largest change in params was -0.00612 in the m_probability of state, level `Exact match on state`\n",
-      "Iteration 5: Largest change in params was -0.00431 in the m_probability of state, level `Exact match on state`\n",
-      "Iteration 6: Largest change in params was -0.00301 in the m_probability of state, level `Exact match on state`\n",
-      "Iteration 7: Largest change in params was 0.0021 in the m_probability of state, level `All other comparisons`\n",
-      "Iteration 8: Largest change in params was -0.00146 in the m_probability of state, level `Exact match on state`\n",
-      "Iteration 9: Largest change in params was 0.00101 in the m_probability of state, level `All other comparisons`\n",
-      "Iteration 10: Largest change in params was -0.000704 in the m_probability of state, level `Exact match on state`\n",
-      "Iteration 11: Largest change in params was 0.000489 in the m_probability of state, level `All other comparisons`\n",
-      "Iteration 12: Largest change in params was -0.00034 in the m_probability of state, level `Exact match on state`\n",
-      "Iteration 13: Largest change in params was -0.000236 in the m_probability of state, level `Exact match on state`\n",
-      "Iteration 14: Largest change in params was 0.000164 in the m_probability of state, level `All other comparisons`\n",
-      "Iteration 15: Largest change in params was -0.000114 in the m_probability of state, level `Exact match on state`\n",
-      "Iteration 16: Largest change in params was -7.88e-05 in the m_probability of state, level `Exact match on state`\n",
+      "Iteration 1: Largest change in params was -0.395 in the m_probability of city, level `All other comparisons`\n",
+      "Iteration 2: Largest change in params was 0.889 in the m_probability of company_name_no_legal, level `All other comparisons`\n",
+      "Iteration 3: Largest change in params was 0.285 in probability_two_random_records_match\n",
+      "Iteration 4: Largest change in params was 0.0152 in probability_two_random_records_match\n",
+      "Iteration 5: Largest change in params was 0.048 in the m_probability of city, level `All other comparisons`\n",
+      "Iteration 6: Largest change in params was 0.0559 in the m_probability of city, level `All other comparisons`\n",
+      "Iteration 7: Largest change in params was 0.0205 in probability_two_random_records_match\n",
+      "Iteration 8: Largest change in params was 0.00696 in probability_two_random_records_match\n",
+      "Iteration 9: Largest change in params was 0.0024 in probability_two_random_records_match\n",
+      "Iteration 10: Largest change in params was 0.000849 in probability_two_random_records_match\n",
+      "Iteration 11: Largest change in params was 0.000305 in probability_two_random_records_match\n",
+      "Iteration 12: Largest change in params was 0.00011 in probability_two_random_records_match\n",
+      "Iteration 13: Largest change in params was 3.98e-05 in probability_two_random_records_match\n",
       "\n",
-      "EM converged after 16 iterations\n",
+      "EM converged after 13 iterations\n",
       "\n",
       "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n"
      ]
@@ -2473,7 +2504,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 388,
+   "execution_count": 429,
    "id": "8ad317ed-1db9-4932-9815-6e9e0efa9580",
    "metadata": {},
    "outputs": [
@@ -2482,23 +2513,112 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-683c397c94694591a2af8e121ffd957d.vega-embed {\n",
+       "  #altair-viz-b091c03ea16e42ce928edfa6f14bcf09.vega-embed {\n",
+       "    width: 100%;\n",
+       "    display: flex;\n",
+       "  }\n",
+       "\n",
+       "  #altair-viz-b091c03ea16e42ce928edfa6f14bcf09.vega-embed details,\n",
+       "  #altair-viz-b091c03ea16e42ce928edfa6f14bcf09.vega-embed details summary {\n",
+       "    position: relative;\n",
+       "  }\n",
+       "</style>\n",
+       "<div id=\"altair-viz-b091c03ea16e42ce928edfa6f14bcf09\"></div>\n",
+       "<script type=\"text/javascript\">\n",
+       "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
+       "  (function(spec, embedOpt){\n",
+       "    let outputDiv = document.currentScript.previousElementSibling;\n",
+       "    if (outputDiv.id !== \"altair-viz-b091c03ea16e42ce928edfa6f14bcf09\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-b091c03ea16e42ce928edfa6f14bcf09\");\n",
+       "    }\n",
+       "    const paths = {\n",
+       "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
+       "      \"vega-lib\": \"https://cdn.jsdelivr.net/npm/vega-lib?noext\",\n",
+       "      \"vega-lite\": \"https://cdn.jsdelivr.net/npm/vega-lite@5.20.1?noext\",\n",
+       "      \"vega-embed\": \"https://cdn.jsdelivr.net/npm/vega-embed@6?noext\",\n",
+       "    };\n",
+       "\n",
+       "    function maybeLoadScript(lib, version) {\n",
+       "      var key = `${lib.replace(\"-\", \"\")}_version`;\n",
+       "      return (VEGA_DEBUG[key] == version) ?\n",
+       "        Promise.resolve(paths[lib]) :\n",
+       "        new Promise(function(resolve, reject) {\n",
+       "          var s = document.createElement('script');\n",
+       "          document.getElementsByTagName(\"head\")[0].appendChild(s);\n",
+       "          s.async = true;\n",
+       "          s.onload = () => {\n",
+       "            VEGA_DEBUG[key] = version;\n",
+       "            return resolve(paths[lib]);\n",
+       "          };\n",
+       "          s.onerror = () => reject(`Error loading script: ${paths[lib]}`);\n",
+       "          s.src = paths[lib];\n",
+       "        });\n",
+       "    }\n",
+       "\n",
+       "    function showError(err) {\n",
+       "      outputDiv.innerHTML = `<div class=\"error\" style=\"color:red;\">${err}</div>`;\n",
+       "      throw err;\n",
+       "    }\n",
+       "\n",
+       "    function displayChart(vegaEmbed) {\n",
+       "      vegaEmbed(outputDiv, spec, embedOpt)\n",
+       "        .catch(err => showError(`Javascript Error: ${err.message}<br>This usually means there's a typo in your chart specification. See the javascript console for the full traceback.`));\n",
+       "    }\n",
+       "\n",
+       "    if(typeof define === \"function\" && define.amd) {\n",
+       "      requirejs.config({paths});\n",
+       "      require([\"vega-embed\"], displayChart, err => showError(`Error loading script: ${err.message}`));\n",
+       "    } else {\n",
+       "      maybeLoadScript(\"vega\", \"5\")\n",
+       "        .then(() => maybeLoadScript(\"vega-lite\", \"5.20.1\"))\n",
+       "        .then(() => maybeLoadScript(\"vega-embed\", \"6\"))\n",
+       "        .catch(showError)\n",
+       "        .then(() => displayChart(vegaEmbed));\n",
+       "    }\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 300, \"continuousHeight\": 300, \"discreteHeight\": 60, \"discreteWidth\": 400}, \"header\": {\"title\": null}, \"mark\": {\"tooltip\": null}, \"title\": {\"anchor\": \"middle\"}}, \"vconcat\": [{\"mark\": {\"type\": \"bar\", \"clip\": true, \"height\": 15}, \"encoding\": {\"color\": {\"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-10, 0, 10], \"interpolate\": \"lab\", \"range\": [\"red\", \"#bbbbbb\", \"green\"]}, \"title\": \"Match weight\", \"type\": \"quantitative\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"probability_two_random_records_match\", \"format\": \".4f\", \"title\": \"Probability two random records match\", \"type\": \"nominal\"}, {\"field\": \"log2_bayes_factor\", \"format\": \",.4f\", \"title\": \"Equivalent match weight\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"domain\": false, \"gridColor\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": \"#aaa\"}, \"value\": \"#ddd\"}, \"gridDash\": {\"condition\": {\"test\": \"abs(datum.value / 10) == 1\", \"value\": [3]}, \"value\": null}, \"gridWidth\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": 2}, \"value\": 1}, \"labels\": false, \"ticks\": false, \"title\": \"\"}, \"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-20, 20]}, \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": \"Prior (starting) match weight\", \"titleAlign\": \"right\", \"titleAngle\": 0, \"titleFontWeight\": \"normal\"}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": 20, \"transform\": [{\"filter\": \"(datum.comparison_name == 
'probability_two_random_records_match')\"}]}, {\"mark\": {\"type\": \"bar\", \"clip\": true}, \"encoding\": {\"color\": {\"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-10, 0, 10], \"interpolate\": \"lab\", \"range\": [\"red\", \"#bbbbbb\", \"green\"]}, \"title\": \"Match weight\", \"type\": \"quantitative\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labelAlign\": \"left\", \"labelAnchor\": \"middle\", \"labelAngle\": 0}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"gridColor\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": \"#aaa\"}, \"value\": \"#ddd\"}, \"gridDash\": {\"condition\": {\"test\": \"abs(datum.value / 10) == 1\", \"value\": [3]}, \"value\": null}, \"gridWidth\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": 2}, \"value\": 1}, 
\"title\": \"Comparison level match weight = log2(m/u)\"}, \"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-20, 20]}, \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"axis\": {\"y\": \"independent\"}, \"scale\": {\"y\": \"independent\"}}, \"transform\": [{\"filter\": \"(datum.comparison_name != 'probability_two_random_records_match')\"}]}], \"data\": {\"name\": \"data-486b54dc4323abf2383382ed2927fd87\"}, \"params\": [{\"name\": \"mouse_zoom\", \"select\": {\"type\": \"interval\", \"encodings\": [\"x\"]}, \"bind\": \"scales\", \"views\": []}], \"resolve\": {\"axis\": {\"y\": \"independent\"}, \"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Model parameters (components of final match weight)\", \"subtitle\": \"Use mousewheel to zoom\"}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-486b54dc4323abf2383382ed2927fd87\": [{\"comparison_name\": \"probability_two_random_records_match\", \"sql_condition\": null, \"label_for_charts\": \"\", \"m_probability\": null, \"u_probability\": null, \"m_probability_description\": null, \"u_probability_description\": null, \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": null, \"is_null_level\": false, \"bayes_factor\": 1.7766488754200009e-06, \"log2_bayes_factor\": -19.10240998404316, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 0, \"bayes_factor_description\": \"The probability that two random records drawn at random match is 0.000 or one in  562,858.4 records.This is equivalent to a starting match weight of -19.102.\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": -1}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": 
\"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Exact match on company_name_no_legal\", \"m_probability\": 1.0, \"u_probability\": 1.1463345458785815e-06, \"m_probability_description\": \"Amongst matching record comparisons, 100% of records (i.e. one in 1) are in the exact match on company_name_no_legal comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.0001146% of records (i.e. one in 872,346) are in the exact match on company_name_no_legal comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"company_name_no_legal\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 872345.6896551723, \"log2_bayes_factor\": 19.734540428156702, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on company_name_no_legal` then comparison is 872,346 times more likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"jaro_winkler_similarity(\\\"company_name_no_legal_l\\\", \\\"company_name_no_legal_r\\\") >= 0.95\", \"label_for_charts\": \"Jaro-Winkler distance of company_name_no_legal >= 0.95\", \"m_probability\": 0.0022252026790704244, \"u_probability\": 3.6564119135782337e-07, \"m_probability_description\": \"Amongst matching record comparisons, 0.2225% of records (i.e. one in 449) are in the jaro-winkler distance of company_name_no_legal >= 0.95 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 3.656e-05% of records (i.e. 
one in 2,734,922) are in the jaro-winkler distance of company_name_no_legal >= 0.95 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 6085.754919480062, \"log2_bayes_factor\": 12.571220520655558, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of company_name_no_legal >= 0.95` then comparison is 6,086 times more likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.9859795613823598, \"u_probability\": 0.9999984880242627, \"m_probability_description\": \"Amongst matching record comparisons, 98.6% of records (i.e. one in 1.014) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 100% of records (i.e. 
one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.985981052161788, \"log2_bayes_factor\": -0.0203681726400408, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.014 times less likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"\\\"street_address_l\\\" = \\\"street_address_r\\\"\", \"label_for_charts\": \"Exact match on street_address\", \"m_probability\": 0.7549019576781999, \"u_probability\": 0.13960280373831777, \"m_probability_description\": \"Amongst matching record comparisons, 75.49% of records (i.e. one in 1.325) are in the exact match on street_address comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 13.96% of records (i.e. 
one in 7.163) are in the exact match on street_address comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 5.407498542029615, \"log2_bayes_factor\": 2.434961371207702, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on street_address` then comparison is 5.407 times more likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"levenshtein(\\\"street_address_l\\\", \\\"street_address_r\\\") <= 1\", \"label_for_charts\": \"Levenshtein distance of street_address <= 1\", \"m_probability\": 0.07843139336913232, \"u_probability\": 0.2336448598130841, \"m_probability_description\": \"Amongst matching record comparisons, 7.843% of records (i.e. one in 12.75) are in the levenshtein distance of street_address <= 1 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 23.36% of records (i.e. 
one in 4.28) are in the levenshtein distance of street_address <= 1 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.33568636361988635, \"log2_bayes_factor\": -1.5748141623724388, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `levenshtein distance of street_address <= 1` then comparison is 2.979 times less likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"LEAST(ARRAY_LENGTH(\\\"street_address_list_l\\\"), ARRAY_LENGTH(\\\"street_address_list_r\\\")) <> 0 AND ARRAY_LENGTH(ARRAY_INTERSECT(\\\"street_address_list_l\\\", \\\"street_address_list_r\\\")) = LEAST(ARRAY_LENGTH(\\\"street_address_list_l\\\"), ARRAY_LENGTH(\\\"street_address_list_r\\\"))\", \"label_for_charts\": \"Array subset\", \"m_probability\": 0.16666664895266775, \"u_probability\": 0.6267523364485982, \"m_probability_description\": \"Amongst matching record comparisons, 16.67% of records (i.e. one in 6) are in the array subset comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 62.68% of records (i.e. 
one in 1.596) are in the array subset comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.2659210652441446, \"log2_bayes_factor\": -1.9109300284119297, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `array subset` then comparison is 3.761 times less likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"state\", \"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Exact match on state\", \"m_probability\": 0.7040254501446316, \"u_probability\": 0.047388885407471534, \"m_probability_description\": \"Amongst matching record comparisons, 70.4% of records (i.e. one in 1.42) are in the exact match on state comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 4.739% of records (i.e. one in 21.1) are in the exact match on state comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"state\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 14.856341188257446, \"log2_bayes_factor\": 3.8930069483478316, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on state` then comparison is 14.86 times more likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 2}, {\"comparison_name\": \"state\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.2959745498553683, \"u_probability\": 0.9526111145925285, \"m_probability_description\": \"Amongst matching record comparisons, 29.6% of records (i.e. 
one in 3.379) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 95.26% of records (i.e. one in 1.05) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.3106981908162692, \"log2_bayes_factor\": -1.6864142541614218, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 3.219 times less likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 2}, {\"comparison_name\": \"city\", \"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Exact match on city\", \"m_probability\": 0.5770511573326839, \"u_probability\": 0.0060865630580326634, \"m_probability_description\": \"Amongst matching record comparisons, 57.71% of records (i.e. one in 1.733) are in the exact match on city comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.6087% of records (i.e. 
one in 164) are in the exact match on city comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"city\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 94.80738995567096, \"log2_bayes_factor\": 6.566927612205768, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on city` then comparison is 94.81 times more likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 3}, {\"comparison_name\": \"city\", \"sql_condition\": \"jaro_winkler_similarity(\\\"city_l\\\", \\\"city_r\\\") >= 0.9\", \"label_for_charts\": \"Jaro-Winkler distance of city >= 0.9\", \"m_probability\": 0.027155350274657646, \"u_probability\": 0.0004625318884682651, \"m_probability_description\": \"Amongst matching record comparisons, 2.716% of records (i.e. one in 36.83) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.04625% of records (i.e. one in 2,162) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 58.710222909356894, \"log2_bayes_factor\": 5.875539829168419, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of city >= 0.9` then comparison is 58.71 times more likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 3}, {\"comparison_name\": \"city\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.39579349044407397, \"u_probability\": 0.9934509050534991, \"m_probability_description\": \"Amongst matching record comparisons, 39.58% of records (i.e. 
one in 2.527) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 99.35% of records (i.e. one in 1.007) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.39840266733941904, \"log2_bayes_factor\": -1.327700788484204, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 2.51 times less likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 3}]}}, {\"mode\": \"vega-lite\"});\n",
+       "</script>"
+      ],
+      "text/plain": [
+       "alt.VConcatChart(...)"
+      ]
+     },
+     "execution_count": 429,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "linker.visualisations.match_weights_chart()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 430,
+   "id": "5e21bf55-64ac-4f4b-8f1c-d7507b5e7af6",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "<style>\n",
+       "  #altair-viz-502ff82b439845389349212cfd7a7eb0.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-683c397c94694591a2af8e121ffd957d.vega-embed details,\n",
-       "  #altair-viz-683c397c94694591a2af8e121ffd957d.vega-embed details summary {\n",
+       "  #altair-viz-502ff82b439845389349212cfd7a7eb0.vega-embed details,\n",
+       "  #altair-viz-502ff82b439845389349212cfd7a7eb0.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-683c397c94694591a2af8e121ffd957d\"></div>\n",
+       "<div id=\"altair-viz-502ff82b439845389349212cfd7a7eb0\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-683c397c94694591a2af8e121ffd957d\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-683c397c94694591a2af8e121ffd957d\");\n",
+       "    if (outputDiv.id !== \"altair-viz-502ff82b439845389349212cfd7a7eb0\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-502ff82b439845389349212cfd7a7eb0\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -2529,41 +2649,1968 @@
        "      throw err;\n",
        "    }\n",
        "\n",
-       "    function displayChart(vegaEmbed) {\n",
-       "      vegaEmbed(outputDiv, spec, embedOpt)\n",
-       "        .catch(err => showError(`Javascript Error: ${err.message}<br>This usually means there's a typo in your chart specification. See the javascript console for the full traceback.`));\n",
+       "    function displayChart(vegaEmbed) {\n",
+       "      vegaEmbed(outputDiv, spec, embedOpt)\n",
+       "        .catch(err => showError(`Javascript Error: ${err.message}<br>This usually means there's a typo in your chart specification. See the javascript console for the full traceback.`));\n",
+       "    }\n",
+       "\n",
+       "    if(typeof define === \"function\" && define.amd) {\n",
+       "      requirejs.config({paths});\n",
+       "      require([\"vega-embed\"], displayChart, err => showError(`Error loading script: ${err.message}`));\n",
+       "    } else {\n",
+       "      maybeLoadScript(\"vega\", \"5\")\n",
+       "        .then(() => maybeLoadScript(\"vega-lite\", \"5.20.1\"))\n",
+       "        .then(() => maybeLoadScript(\"vega-embed\", \"6\"))\n",
+       "        .catch(showError)\n",
+       "        .then(() => displayChart(vegaEmbed));\n",
+       "    }\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 300, \"continuousHeight\": 300, \"discreteHeight\": 300, \"discreteWidth\": 400}, \"header\": {\"title\": null}, \"title\": {\"anchor\": \"middle\", \"offset\": 10}}, \"hconcat\": [{\"mark\": \"bar\", \"encoding\": {\"color\": {\"value\": \"green\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labelAlign\": \"left\", \"labelAnchor\": \"middle\", \"labelAngle\": 0}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"title\": \"Proportion of record comparisons\"}, \"field\": \"m_probability\", \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Amongst 
matching record comparisons:\", \"fontSize\": 12, \"fontWeight\": \"bold\"}, \"transform\": [{\"filter\": \"(datum.bayes_factor != 'no-op filter due to vega lite issue 4680')\"}], \"width\": 150}, {\"mark\": \"bar\", \"encoding\": {\"color\": {\"value\": \"red\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labels\": false}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"title\": \"Proportion of record comparisons\"}, \"field\": \"u_probability\", \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Amongst non-matching record comparisons:\", \"fontSize\": 12, \"fontWeight\": \"bold\"}, \"transform\": 
[{\"filter\": \"(datum.bayes_factor != 'no-op filter2 due to vega lite issue 4680')\"}], \"width\": 150}], \"data\": {\"name\": \"data-98b48f38ee96425504d9a9c7a3e99480\"}, \"title\": {\"text\": \"Proportion of record comparisons in each comparison level by match status\", \"subtitle\": \"(m and u probabilities)\"}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-98b48f38ee96425504d9a9c7a3e99480\": [{\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Exact match on company_name_no_legal\", \"m_probability\": 1.0, \"u_probability\": 1.1463345458785815e-06, \"m_probability_description\": \"Amongst matching record comparisons, 100% of records (i.e. one in 1) are in the exact match on company_name_no_legal comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.0001146% of records (i.e. one in 872,346) are in the exact match on company_name_no_legal comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"company_name_no_legal\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 872345.6896551723, \"log2_bayes_factor\": 19.734540428156702, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on company_name_no_legal` then comparison is 872,346 times more likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"jaro_winkler_similarity(\\\"company_name_no_legal_l\\\", \\\"company_name_no_legal_r\\\") >= 0.95\", \"label_for_charts\": \"Jaro-Winkler distance of company_name_no_legal >= 0.95\", \"m_probability\": 0.0022252026790704244, \"u_probability\": 3.6564119135782337e-07, \"m_probability_description\": \"Amongst 
matching record comparisons, 0.2225% of records (i.e. one in 449) are in the jaro-winkler distance of company_name_no_legal >= 0.95 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 3.656e-05% of records (i.e. one in 2,734,922) are in the jaro-winkler distance of company_name_no_legal >= 0.95 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 6085.754919480062, \"log2_bayes_factor\": 12.571220520655558, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of company_name_no_legal >= 0.95` then comparison is 6,086 times more likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.9859795613823598, \"u_probability\": 0.9999984880242627, \"m_probability_description\": \"Amongst matching record comparisons, 98.6% of records (i.e. one in 1.014) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 100% of records (i.e. 
one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.985981052161788, \"log2_bayes_factor\": -0.0203681726400408, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.014 times less likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"\\\"street_address_l\\\" = \\\"street_address_r\\\"\", \"label_for_charts\": \"Exact match on street_address\", \"m_probability\": 0.7549019576781999, \"u_probability\": 0.13960280373831777, \"m_probability_description\": \"Amongst matching record comparisons, 75.49% of records (i.e. one in 1.325) are in the exact match on street_address comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 13.96% of records (i.e. 
one in 7.163) are in the exact match on street_address comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 5.407498542029615, \"log2_bayes_factor\": 2.434961371207702, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on street_address` then comparison is 5.407 times more likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"levenshtein(\\\"street_address_l\\\", \\\"street_address_r\\\") <= 1\", \"label_for_charts\": \"Levenshtein distance of street_address <= 1\", \"m_probability\": 0.07843139336913232, \"u_probability\": 0.2336448598130841, \"m_probability_description\": \"Amongst matching record comparisons, 7.843% of records (i.e. one in 12.75) are in the levenshtein distance of street_address <= 1 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 23.36% of records (i.e. 
one in 4.28) are in the levenshtein distance of street_address <= 1 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.33568636361988635, \"log2_bayes_factor\": -1.5748141623724388, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `levenshtein distance of street_address <= 1` then comparison is 2.979 times less likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"LEAST(ARRAY_LENGTH(\\\"street_address_list_l\\\"), ARRAY_LENGTH(\\\"street_address_list_r\\\")) <> 0 AND ARRAY_LENGTH(ARRAY_INTERSECT(\\\"street_address_list_l\\\", \\\"street_address_list_r\\\")) = LEAST(ARRAY_LENGTH(\\\"street_address_list_l\\\"), ARRAY_LENGTH(\\\"street_address_list_r\\\"))\", \"label_for_charts\": \"Array subset\", \"m_probability\": 0.16666664895266775, \"u_probability\": 0.6267523364485982, \"m_probability_description\": \"Amongst matching record comparisons, 16.67% of records (i.e. one in 6) are in the array subset comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 62.68% of records (i.e. 
one in 1.596) are in the array subset comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.2659210652441446, \"log2_bayes_factor\": -1.9109300284119297, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `array subset` then comparison is 3.761 times less likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"state\", \"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Exact match on state\", \"m_probability\": 0.7040254501446316, \"u_probability\": 0.047388885407471534, \"m_probability_description\": \"Amongst matching record comparisons, 70.4% of records (i.e. one in 1.42) are in the exact match on state comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 4.739% of records (i.e. one in 21.1) are in the exact match on state comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"state\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 14.856341188257446, \"log2_bayes_factor\": 3.8930069483478316, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on state` then comparison is 14.86 times more likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 2}, {\"comparison_name\": \"state\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.2959745498553683, \"u_probability\": 0.9526111145925285, \"m_probability_description\": \"Amongst matching record comparisons, 29.6% of records (i.e. 
one in 3.379) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 95.26% of records (i.e. one in 1.05) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.3106981908162692, \"log2_bayes_factor\": -1.6864142541614218, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 3.219 times less likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 2}, {\"comparison_name\": \"city\", \"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Exact match on city\", \"m_probability\": 0.5770511573326839, \"u_probability\": 0.0060865630580326634, \"m_probability_description\": \"Amongst matching record comparisons, 57.71% of records (i.e. one in 1.733) are in the exact match on city comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.6087% of records (i.e. 
one in 164) are in the exact match on city comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"city\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 94.80738995567096, \"log2_bayes_factor\": 6.566927612205768, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on city` then comparison is 94.81 times more likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 3}, {\"comparison_name\": \"city\", \"sql_condition\": \"jaro_winkler_similarity(\\\"city_l\\\", \\\"city_r\\\") >= 0.9\", \"label_for_charts\": \"Jaro-Winkler distance of city >= 0.9\", \"m_probability\": 0.027155350274657646, \"u_probability\": 0.0004625318884682651, \"m_probability_description\": \"Amongst matching record comparisons, 2.716% of records (i.e. one in 36.83) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.04625% of records (i.e. one in 2,162) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 58.710222909356894, \"log2_bayes_factor\": 5.875539829168419, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of city >= 0.9` then comparison is 58.71 times more likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 3}, {\"comparison_name\": \"city\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.39579349044407397, \"u_probability\": 0.9934509050534991, \"m_probability_description\": \"Amongst matching record comparisons, 39.58% of records (i.e. 
one in 2.527) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 99.35% of records (i.e. one in 1.007) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.39840266733941904, \"log2_bayes_factor\": -1.327700788484204, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 2.51 times less likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 3}]}}, {\"mode\": \"vega-lite\"});\n",
+       "</script>"
+      ],
+      "text/plain": [
+       "alt.HConcatChart(...)"
+      ]
+     },
+     "execution_count": 430,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "linker.visualisations.m_u_parameters_chart()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 420,
+   "id": "fedb78e1-ee73-4d1e-8a96-3b27f6561a91",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "settings = linker.misc.save_model_to_json(\n",
+    "    \"model_unsupervised_city_state_0.json\", overwrite=True\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "31f9d73d-cfa4-41fa-906f-c8501a29283b",
+   "metadata": {},
+   "source": [
+    "## Make Predictions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 431,
+   "id": "94e96441-89b6-4516-aa6a-4d1593ce03be",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "3ce1c0af73694400974ca6253619dd5b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Blocking time: 9.73 seconds\n",
+      "Predict time: 0.52 seconds\n"
+     ]
+    }
+   ],
+   "source": [
+    "# it's helpful to keep threshold at .5 just to see what makes it into blocking\n",
+    "# df_predictions = linker.inference.predict(threshold_match_probability=0.5)\n",
+    "df_predictions = linker.inference.predict()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 432,
+   "id": "722effbc-24e8-45ca-aa64-8cdcd75846e0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "preds_df = df_predictions.as_pandas_dataframe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 433,
+   "id": "21b19a11-15c6-46ab-bd73-7c4752f7e25e",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>match_weight</th>\n",
+       "      <th>match_probability</th>\n",
+       "      <th>source_dataset_l</th>\n",
+       "      <th>source_dataset_r</th>\n",
+       "      <th>record_id_l</th>\n",
+       "      <th>record_id_r</th>\n",
+       "      <th>company_name_no_legal_l</th>\n",
+       "      <th>company_name_no_legal_r</th>\n",
+       "      <th>gamma_company_name_no_legal</th>\n",
+       "      <th>tf_company_name_no_legal_l</th>\n",
+       "      <th>tf_company_name_no_legal_r</th>\n",
+       "      <th>bf_company_name_no_legal</th>\n",
+       "      <th>bf_tf_adj_company_name_no_legal</th>\n",
+       "      <th>street_address_l</th>\n",
+       "      <th>street_address_r</th>\n",
+       "      <th>street_address_list_l</th>\n",
+       "      <th>street_address_list_r</th>\n",
+       "      <th>gamma_street_address</th>\n",
+       "      <th>bf_street_address</th>\n",
+       "      <th>state_l</th>\n",
+       "      <th>state_r</th>\n",
+       "      <th>gamma_state</th>\n",
+       "      <th>tf_state_l</th>\n",
+       "      <th>tf_state_r</th>\n",
+       "      <th>bf_state</th>\n",
+       "      <th>bf_tf_adj_state</th>\n",
+       "      <th>city_l</th>\n",
+       "      <th>city_r</th>\n",
+       "      <th>gamma_city</th>\n",
+       "      <th>tf_city_l</th>\n",
+       "      <th>tf_city_r</th>\n",
+       "      <th>bf_city</th>\n",
+       "      <th>bf_tf_adj_city</th>\n",
+       "      <th>company_name_mphone_l</th>\n",
+       "      <th>company_name_mphone_r</th>\n",
+       "      <th>match_key</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>32260</th>\n",
+       "      <td>-24.047823</td>\n",
+       "      <td>5.766122e-08</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>82087</td>\n",
+       "      <td>113663</td>\n",
+       "      <td>sutro biopharma</td>\n",
+       "      <td>stirling energy systems solar one</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000019</td>\n",
+       "      <td>0.000029</td>\n",
+       "      <td>0.985981</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>310 utah ave., suite 150</td>\n",
+       "      <td>suite 150</td>\n",
+       "      <td>[310, utah, ave.,, suite, 150]</td>\n",
+       "      <td>[suite, 150]</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.265921</td>\n",
+       "      <td>ca</td>\n",
+       "      <td>az</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.149142</td>\n",
+       "      <td>0.012950</td>\n",
+       "      <td>0.310698</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>south san francisco</td>\n",
+       "      <td>phoenix</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.001438</td>\n",
+       "      <td>0.003511</td>\n",
+       "      <td>0.398403</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>STR BFRM</td>\n",
+       "      <td>STRLNK ENRJ SSTMS SLR ON</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>27875</th>\n",
+       "      <td>-24.047823</td>\n",
+       "      <td>5.766122e-08</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>126035</td>\n",
+       "      <td>113797</td>\n",
+       "      <td>corner growth acquisition 2</td>\n",
+       "      <td>grubb and ellis management services</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000010</td>\n",
+       "      <td>0.000019</td>\n",
+       "      <td>0.985981</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>251 lytton avenue, suite 200</td>\n",
+       "      <td>suite 200</td>\n",
+       "      <td>[251, lytton, avenue,, suite, 200]</td>\n",
+       "      <td>[suite, 200]</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.265921</td>\n",
+       "      <td>ca</td>\n",
+       "      <td>pa</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.149142</td>\n",
+       "      <td>0.030197</td>\n",
+       "      <td>0.310698</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>palo alto</td>\n",
+       "      <td>pittsburgh</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.001850</td>\n",
+       "      <td>0.003656</td>\n",
+       "      <td>0.398403</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>KRNR KR0 AKKSXN</td>\n",
+       "      <td>KRB ANT ELS MNJMNT SRFSS</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>27993</th>\n",
+       "      <td>-24.047823</td>\n",
+       "      <td>5.766122e-08</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>125096</td>\n",
+       "      <td>97905</td>\n",
+       "      <td>altus power</td>\n",
+       "      <td>allegheny ridge wind farm</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000010</td>\n",
+       "      <td>0.000038</td>\n",
+       "      <td>0.985981</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>2200 atlantic street, 6th floor</td>\n",
+       "      <td>6th floor</td>\n",
+       "      <td>[2200, atlantic, street,, 6th, floor]</td>\n",
+       "      <td>[6th, floor]</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.265921</td>\n",
+       "      <td>ct</td>\n",
+       "      <td>ca</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.020325</td>\n",
+       "      <td>0.149142</td>\n",
+       "      <td>0.310698</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>stamford</td>\n",
+       "      <td>san francisco</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.003789</td>\n",
+       "      <td>0.013374</td>\n",
+       "      <td>0.398403</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>ALTS PWR</td>\n",
+       "      <td>ALKHN RJ WNT FRM</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>28003</th>\n",
+       "      <td>-24.047823</td>\n",
+       "      <td>5.766122e-08</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>115402</td>\n",
+       "      <td>91508</td>\n",
+       "      <td>clearway energy</td>\n",
+       "      <td>clipper windpower</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000038</td>\n",
+       "      <td>0.000029</td>\n",
+       "      <td>0.985981</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>300 carnegie center, suite 300</td>\n",
+       "      <td>suite 300</td>\n",
+       "      <td>[300, carnegie, center,, suite, 300]</td>\n",
+       "      <td>[suite, 300]</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.265921</td>\n",
+       "      <td>nj</td>\n",
+       "      <td>ca</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.031159</td>\n",
+       "      <td>0.149142</td>\n",
+       "      <td>0.310698</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>princeton</td>\n",
+       "      <td>carpinteria</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.002118</td>\n",
+       "      <td>0.000189</td>\n",
+       "      <td>0.398403</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>KLRW ENRJ</td>\n",
+       "      <td>KLPR WNTPWR</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>28024</th>\n",
+       "      <td>-24.047823</td>\n",
+       "      <td>5.766122e-08</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>125009</td>\n",
+       "      <td>77758</td>\n",
+       "      <td>benchmark 2020 b21 mortgage trust</td>\n",
+       "      <td>bountiful city city of</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000010</td>\n",
+       "      <td>0.000048</td>\n",
+       "      <td>0.985981</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>200 west street</td>\n",
+       "      <td>198 south 200 west street</td>\n",
+       "      <td>[200, west, street]</td>\n",
+       "      <td>[198, south, 200, west, street]</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.265921</td>\n",
+       "      <td>ny</td>\n",
+       "      <td>ut</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.113010</td>\n",
+       "      <td>0.010475</td>\n",
+       "      <td>0.310698</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>new york</td>\n",
+       "      <td>bountiful city</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.086944</td>\n",
+       "      <td>0.000022</td>\n",
+       "      <td>0.398403</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>BNXMRK B MRTKJ TRST</td>\n",
+       "      <td>BNTFL ST ST OF</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1038434</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>137784</td>\n",
+       "      <td>70294</td>\n",
+       "      <td>farmer brothers</td>\n",
+       "      <td>farmers electric ia</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000029</td>\n",
+       "      <td>0.000038</td>\n",
+       "      <td>0.985981</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>20333 s normandie ave</td>\n",
+       "      <td>1959 yoder ave,sw</td>\n",
+       "      <td>[20333, s, normandie, ave]</td>\n",
+       "      <td>[1959, yoder, ave,sw]</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>ca</td>\n",
+       "      <td>ia</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.149142</td>\n",
+       "      <td>0.016527</td>\n",
+       "      <td>0.310698</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>torrance</td>\n",
+       "      <td>kalona</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.002485</td>\n",
+       "      <td>0.000011</td>\n",
+       "      <td>0.398403</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>FRMR BR0RS</td>\n",
+       "      <td>FRMRS ELKTRK I</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1038441</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>139631</td>\n",
+       "      <td>137540</td>\n",
+       "      <td>international game technology</td>\n",
+       "      <td>intergen north america</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000048</td>\n",
+       "      <td>0.000029</td>\n",
+       "      <td>0.985981</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>6355 south buffalo drive</td>\n",
+       "      <td>4th floor</td>\n",
+       "      <td>[6355, south, buffalo, drive]</td>\n",
+       "      <td>[4th, floor]</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>nv</td>\n",
+       "      <td>ma</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.019288</td>\n",
+       "      <td>0.041401</td>\n",
+       "      <td>0.310698</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>las vegas</td>\n",
+       "      <td>burlington</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.010477</td>\n",
+       "      <td>0.001415</td>\n",
+       "      <td>0.398403</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>INTRNXNL KM TXNLJ</td>\n",
+       "      <td>INTRJN NR0 AMRK</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1038443</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>90853</td>\n",
+       "      <td>13424</td>\n",
+       "      <td>monster arts</td>\n",
+       "      <td>minnesota solar csg 4</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000010</td>\n",
+       "      <td>0.000029</td>\n",
+       "      <td>0.985981</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>806 east avenida pico</td>\n",
+       "      <td>200 wellington street west, su</td>\n",
+       "      <td>[806, east, avenida, pico]</td>\n",
+       "      <td>[200, wellington, street, west,, su]</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>ca</td>\n",
+       "      <td>None</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>0.149142</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>san clemente</td>\n",
+       "      <td>toronto</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000346</td>\n",
+       "      <td>0.002129</td>\n",
+       "      <td>0.398403</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>MNSTR ARTS</td>\n",
+       "      <td>MNST SLR KSK</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1038454</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>108136</td>\n",
+       "      <td>1959</td>\n",
+       "      <td>nxt id</td>\n",
+       "      <td>nextgrid mastic</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000038</td>\n",
+       "      <td>0.000029</td>\n",
+       "      <td>0.985981</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>4 research drive, #402</td>\n",
+       "      <td>879 sanchez street</td>\n",
+       "      <td>[4, research, drive,, #402]</td>\n",
+       "      <td>[879, sanchez, street]</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>ct</td>\n",
+       "      <td>ca</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.020325</td>\n",
+       "      <td>0.149142</td>\n",
+       "      <td>0.310698</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>shelton</td>\n",
+       "      <td>san francisco</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000390</td>\n",
+       "      <td>0.013374</td>\n",
+       "      <td>0.398403</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>NKST IT</td>\n",
+       "      <td>NKSTKRT MSTK</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1038456</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>91657</td>\n",
+       "      <td>105602</td>\n",
+       "      <td>coronado biosciences</td>\n",
+       "      <td>garnet energy</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000019</td>\n",
+       "      <td>0.000038</td>\n",
+       "      <td>0.985981</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>24 new england executive park</td>\n",
+       "      <td>suite 102</td>\n",
+       "      <td>[24, new, england, executive, park]</td>\n",
+       "      <td>[suite, 102]</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>ma</td>\n",
+       "      <td>ca</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.041401</td>\n",
+       "      <td>0.149142</td>\n",
+       "      <td>0.310698</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>burlington</td>\n",
+       "      <td>westlake village</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.001415</td>\n",
+       "      <td>0.000691</td>\n",
+       "      <td>0.398403</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>KRNT BSSNSS</td>\n",
+       "      <td>KRNT ENRJ</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>1038457 rows × 36 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         match_weight  match_probability         source_dataset_l         source_dataset_r  record_id_l  record_id_r            company_name_no_legal_l              company_name_no_legal_r  gamma_company_name_no_legal  tf_company_name_no_legal_l  tf_company_name_no_legal_r  bf_company_name_no_legal  bf_tf_adj_company_name_no_legal                 street_address_l                street_address_r                  street_address_list_l                 street_address_list_r  gamma_street_address  bf_street_address state_l state_r  gamma_state  tf_state_l  tf_state_r  bf_state  bf_tf_adj_state               city_l            city_r  gamma_city  tf_city_l  tf_city_r   bf_city  bf_tf_adj_city company_name_mphone_l     company_name_mphone_r match_key\n",
+       "32260      -24.047823       5.766122e-08  __splink__input_table_0  __splink__input_table_1        82087       113663                    sutro biopharma    stirling energy systems solar one                            0                    0.000019                    0.000029                  0.985981                              1.0         310 utah ave., suite 150                       suite 150         [310, utah, ave.,, suite, 150]                          [suite, 150]                   0.0           0.265921      ca      az            0    0.149142    0.012950  0.310698              1.0  south san francisco           phoenix           0   0.001438   0.003511  0.398403             1.0              STR BFRM  STRLNK ENRJ SSTMS SLR ON         3\n",
+       "27875      -24.047823       5.766122e-08  __splink__input_table_0  __splink__input_table_1       126035       113797        corner growth acquisition 2  grubb and ellis management services                            0                    0.000010                    0.000019                  0.985981                              1.0     251 lytton avenue, suite 200                       suite 200     [251, lytton, avenue,, suite, 200]                          [suite, 200]                   0.0           0.265921      ca      pa            0    0.149142    0.030197  0.310698              1.0            palo alto        pittsburgh           0   0.001850   0.003656  0.398403             1.0      KRNR KR0 AKKSXN   KRB ANT ELS MNJMNT SRFSS         3\n",
+       "27993      -24.047823       5.766122e-08  __splink__input_table_0  __splink__input_table_1       125096        97905                        altus power            allegheny ridge wind farm                            0                    0.000010                    0.000038                  0.985981                              1.0  2200 atlantic street, 6th floor                       6th floor  [2200, atlantic, street,, 6th, floor]                          [6th, floor]                   0.0           0.265921      ct      ca            0    0.020325    0.149142  0.310698              1.0             stamford     san francisco           0   0.003789   0.013374  0.398403             1.0              ALTS PWR          ALKHN RJ WNT FRM         3\n",
+       "28003      -24.047823       5.766122e-08  __splink__input_table_0  __splink__input_table_1       115402        91508                    clearway energy                    clipper windpower                            0                    0.000038                    0.000029                  0.985981                              1.0   300 carnegie center, suite 300                       suite 300   [300, carnegie, center,, suite, 300]                          [suite, 300]                   0.0           0.265921      nj      ca            0    0.031159    0.149142  0.310698              1.0            princeton       carpinteria           0   0.002118   0.000189  0.398403             1.0             KLRW ENRJ               KLPR WNTPWR         3\n",
+       "28024      -24.047823       5.766122e-08  __splink__input_table_0  __splink__input_table_1       125009        77758  benchmark 2020 b21 mortgage trust               bountiful city city of                            0                    0.000010                    0.000048                  0.985981                              1.0                  200 west street       198 south 200 west street                    [200, west, street]       [198, south, 200, west, street]                   0.0           0.265921      ny      ut            0    0.113010    0.010475  0.310698              1.0             new york    bountiful city           0   0.086944   0.000022  0.398403             1.0   BNXMRK B MRTKJ TRST            BNTFL ST ST OF         3\n",
+       "...               ...                ...                      ...                      ...          ...          ...                                ...                                  ...                          ...                         ...                         ...                       ...                              ...                              ...                             ...                                    ...                                   ...                   ...                ...     ...     ...          ...         ...         ...       ...              ...                  ...               ...         ...        ...        ...       ...             ...                   ...                       ...       ...\n",
+       "1038434           NaN                NaN  __splink__input_table_0  __splink__input_table_1       137784        70294                    farmer brothers                  farmers electric ia                            0                    0.000029                    0.000038                  0.985981                              1.0            20333 s normandie ave               1959 yoder ave,sw             [20333, s, normandie, ave]                 [1959, yoder, ave,sw]                   NaN                NaN      ca      ia            0    0.149142    0.016527  0.310698              1.0             torrance            kalona           0   0.002485   0.000011  0.398403             1.0            FRMR BR0RS            FRMRS ELKTRK I         0\n",
+       "1038441           NaN                NaN  __splink__input_table_0  __splink__input_table_1       139631       137540      international game technology               intergen north america                            0                    0.000048                    0.000029                  0.985981                              1.0         6355 south buffalo drive                       4th floor          [6355, south, buffalo, drive]                          [4th, floor]                   NaN                NaN      nv      ma            0    0.019288    0.041401  0.310698              1.0            las vegas        burlington           0   0.010477   0.001415  0.398403             1.0     INTRNXNL KM TXNLJ           INTRJN NR0 AMRK         0\n",
+       "1038443           NaN                NaN  __splink__input_table_0  __splink__input_table_1        90853        13424                       monster arts                minnesota solar csg 4                            0                    0.000010                    0.000029                  0.985981                              1.0            806 east avenida pico  200 wellington street west, su             [806, east, avenida, pico]  [200, wellington, street, west,, su]                   NaN                NaN      ca    None           -1    0.149142         NaN  1.000000              1.0         san clemente           toronto           0   0.000346   0.002129  0.398403             1.0            MNSTR ARTS             MNST SLR KSK          0\n",
+       "1038454           NaN                NaN  __splink__input_table_0  __splink__input_table_1       108136         1959                             nxt id                      nextgrid mastic                            0                    0.000038                    0.000029                  0.985981                              1.0           4 research drive, #402              879 sanchez street            [4, research, drive,, #402]                [879, sanchez, street]                   NaN                NaN      ct      ca            0    0.020325    0.149142  0.310698              1.0              shelton     san francisco           0   0.000390   0.013374  0.398403             1.0               NKST IT              NKSTKRT MSTK         0\n",
+       "1038456           NaN                NaN  __splink__input_table_0  __splink__input_table_1        91657       105602               coronado biosciences                        garnet energy                            0                    0.000019                    0.000038                  0.985981                              1.0    24 new england executive park                       suite 102    [24, new, england, executive, park]                          [suite, 102]                   NaN                NaN      ma      ca            0    0.041401    0.149142  0.310698              1.0           burlington  westlake village           0   0.001415   0.000691  0.398403             1.0           KRNT BSSNSS                 KRNT ENRJ         0\n",
+       "\n",
+       "[1038457 rows x 36 columns]"
+      ]
+     },
+     "execution_count": 433,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "preds_df.sort_values(by=\"match_probability\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 434,
+   "id": "c0b292c8-26ed-407a-866e-75851577d567",
+   "metadata": {},
+   "outputs": [],
+   "source": [
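+    "# attach central_index_key (CIK) and raw company name from the SEC side by joining on record_id; utility_id_eia is attached in the next cell\n",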
+    "preds_validation_df = preds_df.merge(sec_clean_df[[\"record_id\", \"central_index_key\", \"company_name_raw\"]],\n",
+    "                                     how=\"left\",\n",
+    "                                     left_on=\"record_id_l\",\n",
+    "                                     right_on=\"record_id\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 435,
+   "id": "e8744d54-3cc6-434c-bbbe-a10d2d3cd9c0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "preds_validation_df = preds_validation_df.merge(eia_clean_df[[\"record_id\", \"utility_id_eia\"]],\n",
+    "                                                how=\"left\",\n",
+    "                                                left_on=\"record_id_r\",\n",
+    "                                                right_on=\"record_id\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 436,
+   "id": "5103190c-3775-427f-a8f2-cc8a8f79892b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "preds_validation_df = preds_validation_df.sort_values(\n",
+    "    by=[\"central_index_key\", \"utility_id_eia\", \"match_probability\"], ascending=False\n",
+    ").drop_duplicates(subset=[\"central_index_key\", \"utility_id_eia\"], keep=\"first\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 437,
+   "id": "8fa04f18-beff-4bdc-bbbb-705950eab5b8",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>match_weight</th>\n",
+       "      <th>match_probability</th>\n",
+       "      <th>source_dataset_l</th>\n",
+       "      <th>source_dataset_r</th>\n",
+       "      <th>record_id_l</th>\n",
+       "      <th>record_id_r</th>\n",
+       "      <th>company_name_no_legal_l</th>\n",
+       "      <th>company_name_no_legal_r</th>\n",
+       "      <th>gamma_company_name_no_legal</th>\n",
+       "      <th>tf_company_name_no_legal_l</th>\n",
+       "      <th>tf_company_name_no_legal_r</th>\n",
+       "      <th>bf_company_name_no_legal</th>\n",
+       "      <th>bf_tf_adj_company_name_no_legal</th>\n",
+       "      <th>street_address_l</th>\n",
+       "      <th>street_address_r</th>\n",
+       "      <th>street_address_list_l</th>\n",
+       "      <th>street_address_list_r</th>\n",
+       "      <th>gamma_street_address</th>\n",
+       "      <th>bf_street_address</th>\n",
+       "      <th>state_l</th>\n",
+       "      <th>state_r</th>\n",
+       "      <th>gamma_state</th>\n",
+       "      <th>tf_state_l</th>\n",
+       "      <th>tf_state_r</th>\n",
+       "      <th>bf_state</th>\n",
+       "      <th>bf_tf_adj_state</th>\n",
+       "      <th>city_l</th>\n",
+       "      <th>city_r</th>\n",
+       "      <th>gamma_city</th>\n",
+       "      <th>tf_city_l</th>\n",
+       "      <th>tf_city_r</th>\n",
+       "      <th>bf_city</th>\n",
+       "      <th>bf_tf_adj_city</th>\n",
+       "      <th>company_name_mphone_l</th>\n",
+       "      <th>company_name_mphone_r</th>\n",
+       "      <th>match_key</th>\n",
+       "      <th>record_id_x</th>\n",
+       "      <th>central_index_key</th>\n",
+       "      <th>company_name_raw</th>\n",
+       "      <th>record_id_y</th>\n",
+       "      <th>utility_id_eia</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>889845</th>\n",
+       "      <td>5.679807</td>\n",
+       "      <td>0.980865</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>51956</td>\n",
+       "      <td>22658</td>\n",
+       "      <td>constellation energy</td>\n",
+       "      <td>constellation newenergy</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.000029</td>\n",
+       "      <td>0.000077</td>\n",
+       "      <td>6085.754919</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1310 point street</td>\n",
+       "      <td>None</td>\n",
+       "      <td>[1310, point, street]</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>md</td>\n",
+       "      <td>md</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.023298</td>\n",
+       "      <td>0.023298</td>\n",
+       "      <td>14.856341</td>\n",
+       "      <td>2.034020</td>\n",
+       "      <td>baltimore</td>\n",
+       "      <td>baltimore</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.003678</td>\n",
+       "      <td>0.003678</td>\n",
+       "      <td>94.80739</td>\n",
+       "      <td>1.654881</td>\n",
+       "      <td>KNSTLXN ENRJ</td>\n",
+       "      <td>KNSTLXN NWNRJ</td>\n",
+       "      <td>0</td>\n",
+       "      <td>51956</td>\n",
+       "      <td>0001868275</td>\n",
+       "      <td>constellation energy corp</td>\n",
+       "      <td>22658</td>\n",
+       "      <td>58491</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>884109</th>\n",
+       "      <td>13.095633</td>\n",
+       "      <td>0.999886</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>120267</td>\n",
+       "      <td>96849</td>\n",
+       "      <td>evergy</td>\n",
+       "      <td>evergy</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000019</td>\n",
+       "      <td>0.000019</td>\n",
+       "      <td>872345.689655</td>\n",
+       "      <td>0.059564</td>\n",
+       "      <td>1200 main street</td>\n",
+       "      <td>1200 main street</td>\n",
+       "      <td>[1200, main, street]</td>\n",
+       "      <td>[1200, main, street]</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>5.407499</td>\n",
+       "      <td>mo</td>\n",
+       "      <td>mo</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.011744</td>\n",
+       "      <td>0.011744</td>\n",
+       "      <td>14.856341</td>\n",
+       "      <td>4.035057</td>\n",
+       "      <td>kansas city</td>\n",
+       "      <td>kansas city</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.001973</td>\n",
+       "      <td>0.001973</td>\n",
+       "      <td>94.80739</td>\n",
+       "      <td>3.085372</td>\n",
+       "      <td>EFRJ</td>\n",
+       "      <td>EFRJ</td>\n",
+       "      <td>0</td>\n",
+       "      <td>120267</td>\n",
+       "      <td>0001711269</td>\n",
+       "      <td>evergy, inc.</td>\n",
+       "      <td>96849</td>\n",
+       "      <td>64428</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>893941</th>\n",
+       "      <td>12.486567</td>\n",
+       "      <td>0.999826</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>120222</td>\n",
+       "      <td>96211</td>\n",
+       "      <td>consol energy</td>\n",
+       "      <td>consol energy</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000058</td>\n",
+       "      <td>0.000058</td>\n",
+       "      <td>872345.689655</td>\n",
+       "      <td>0.019855</td>\n",
+       "      <td>275 technology drive</td>\n",
+       "      <td>275 technology drive</td>\n",
+       "      <td>[275, technology, drive]</td>\n",
+       "      <td>[275, technology, drive]</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>5.407499</td>\n",
+       "      <td>pa</td>\n",
+       "      <td>pa</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.030197</td>\n",
+       "      <td>0.030197</td>\n",
+       "      <td>14.856341</td>\n",
+       "      <td>1.569346</td>\n",
+       "      <td>canonsburg</td>\n",
+       "      <td>canonsburg</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000390</td>\n",
+       "      <td>0.000390</td>\n",
+       "      <td>94.80739</td>\n",
+       "      <td>15.603165</td>\n",
+       "      <td>KNSL ENRJ</td>\n",
+       "      <td>KNSL ENRJ</td>\n",
+       "      <td>0</td>\n",
+       "      <td>120222</td>\n",
+       "      <td>0001710366</td>\n",
+       "      <td>consol energy inc.</td>\n",
+       "      <td>96211</td>\n",
+       "      <td>4299</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>943594</th>\n",
+       "      <td>9.161274</td>\n",
+       "      <td>0.998256</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>119271</td>\n",
+       "      <td>83669</td>\n",
+       "      <td>vistra energy</td>\n",
+       "      <td>vistra energy</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000019</td>\n",
+       "      <td>0.000019</td>\n",
+       "      <td>872345.689655</td>\n",
+       "      <td>0.059564</td>\n",
+       "      <td>6555 sierra drive</td>\n",
+       "      <td>6555 sierra drive</td>\n",
+       "      <td>[6555, sierra, drive]</td>\n",
+       "      <td>[6555, sierra, drive]</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>5.407499</td>\n",
+       "      <td>tx</td>\n",
+       "      <td>tx</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.080866</td>\n",
+       "      <td>0.080866</td>\n",
+       "      <td>14.856341</td>\n",
+       "      <td>0.586015</td>\n",
+       "      <td>irving</td>\n",
+       "      <td>irving</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.004380</td>\n",
+       "      <td>0.004380</td>\n",
+       "      <td>94.80739</td>\n",
+       "      <td>1.389595</td>\n",
+       "      <td>FSTR ENRJ</td>\n",
+       "      <td>FSTR ENRJ</td>\n",
+       "      <td>0</td>\n",
+       "      <td>119271</td>\n",
+       "      <td>0001692819</td>\n",
+       "      <td>vistra energy corp.</td>\n",
+       "      <td>83669</td>\n",
+       "      <td>62723</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>860414</th>\n",
+       "      <td>7.576311</td>\n",
+       "      <td>0.994788</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>119274</td>\n",
+       "      <td>71441</td>\n",
+       "      <td>vistra</td>\n",
+       "      <td>vistra</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000058</td>\n",
+       "      <td>0.000058</td>\n",
+       "      <td>872345.689655</td>\n",
+       "      <td>0.019855</td>\n",
+       "      <td>6555 sierra drive</td>\n",
+       "      <td>6555 sierra drive</td>\n",
+       "      <td>[6555, sierra, drive]</td>\n",
+       "      <td>[6555, sierra, drive]</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>5.407499</td>\n",
+       "      <td>tx</td>\n",
+       "      <td>tx</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.080866</td>\n",
+       "      <td>0.080866</td>\n",
+       "      <td>14.856341</td>\n",
+       "      <td>0.586015</td>\n",
+       "      <td>irving</td>\n",
+       "      <td>irving</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.004380</td>\n",
+       "      <td>0.004380</td>\n",
+       "      <td>94.80739</td>\n",
+       "      <td>1.389595</td>\n",
+       "      <td>FSTR</td>\n",
+       "      <td>FSTR</td>\n",
+       "      <td>0</td>\n",
+       "      <td>119274</td>\n",
+       "      <td>0001692819</td>\n",
+       "      <td>vistra corp.</td>\n",
+       "      <td>71441</td>\n",
+       "      <td>5504</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1026765</th>\n",
+       "      <td>12.087133</td>\n",
+       "      <td>0.999770</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>153106</td>\n",
+       "      <td>79761</td>\n",
+       "      <td>archer daniels midland</td>\n",
+       "      <td>archer daniels midland</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000058</td>\n",
+       "      <td>0.000058</td>\n",
+       "      <td>872345.689655</td>\n",
+       "      <td>0.019855</td>\n",
+       "      <td>4666 faries pkwy</td>\n",
+       "      <td>4666 faries pkwy</td>\n",
+       "      <td>[4666, faries, pkwy]</td>\n",
+       "      <td>[4666, faries, pkwy]</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>5.407499</td>\n",
+       "      <td>il</td>\n",
+       "      <td>il</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.033191</td>\n",
+       "      <td>0.033191</td>\n",
+       "      <td>14.856341</td>\n",
+       "      <td>1.427770</td>\n",
+       "      <td>decatur</td>\n",
+       "      <td>decatur</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000468</td>\n",
+       "      <td>0.000468</td>\n",
+       "      <td>94.80739</td>\n",
+       "      <td>13.002638</td>\n",
+       "      <td>ARXR TNLS MTLNT</td>\n",
+       "      <td>ARXR TNLS MTLNT</td>\n",
+       "      <td>0</td>\n",
+       "      <td>153106</td>\n",
+       "      <td>0000007084</td>\n",
+       "      <td>archer daniels midland co</td>\n",
+       "      <td>79761</td>\n",
+       "      <td>772</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>656833</th>\n",
+       "      <td>9.809977</td>\n",
+       "      <td>0.998887</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>150546</td>\n",
+       "      <td>79913</td>\n",
+       "      <td>appalachian power</td>\n",
+       "      <td>appalachian power</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000077</td>\n",
+       "      <td>0.000077</td>\n",
+       "      <td>872345.689655</td>\n",
+       "      <td>0.014891</td>\n",
+       "      <td>1 riverside plaza</td>\n",
+       "      <td>1 riverside plaza</td>\n",
+       "      <td>[1, riverside, plaza]</td>\n",
+       "      <td>[1, riverside, plaza]</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>5.407499</td>\n",
+       "      <td>oh</td>\n",
+       "      <td>oh</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.018770</td>\n",
+       "      <td>0.018770</td>\n",
+       "      <td>14.856341</td>\n",
+       "      <td>2.524754</td>\n",
+       "      <td>columbus</td>\n",
+       "      <td>columbus</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.003009</td>\n",
+       "      <td>0.003009</td>\n",
+       "      <td>94.80739</td>\n",
+       "      <td>2.022633</td>\n",
+       "      <td>APLXN PWR</td>\n",
+       "      <td>APLXN PWR</td>\n",
+       "      <td>0</td>\n",
+       "      <td>150546</td>\n",
+       "      <td>0000006879</td>\n",
+       "      <td>appalachian power co</td>\n",
+       "      <td>79913</td>\n",
+       "      <td>733</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>640747</th>\n",
+       "      <td>10.888046</td>\n",
+       "      <td>0.999473</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>144743</td>\n",
+       "      <td>80319</td>\n",
+       "      <td>american crystal sugar /mn/</td>\n",
+       "      <td>american crystal sugar</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.000010</td>\n",
+       "      <td>0.000029</td>\n",
+       "      <td>6085.754919</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>101 n 3rd st</td>\n",
+       "      <td>None</td>\n",
+       "      <td>[101, n, 3rd, st]</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>mn</td>\n",
+       "      <td>mn</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.025996</td>\n",
+       "      <td>0.025996</td>\n",
+       "      <td>14.856341</td>\n",
+       "      <td>1.822919</td>\n",
+       "      <td>moorhead</td>\n",
+       "      <td>moorhead</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000089</td>\n",
+       "      <td>0.000089</td>\n",
+       "      <td>94.80739</td>\n",
+       "      <td>68.263848</td>\n",
+       "      <td>AMRKN KRSTL SKR MN</td>\n",
+       "      <td>AMRKN KRSTL SKR</td>\n",
+       "      <td>0</td>\n",
+       "      <td>144743</td>\n",
+       "      <td>0000004828</td>\n",
+       "      <td>american crystal sugar co /mn/</td>\n",
+       "      <td>80319</td>\n",
+       "      <td>491</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>998578</th>\n",
+       "      <td>9.990554</td>\n",
+       "      <td>0.999018</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>2575</td>\n",
+       "      <td>80977</td>\n",
+       "      <td>alabama power</td>\n",
+       "      <td>alabama power</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000067</td>\n",
+       "      <td>0.000067</td>\n",
+       "      <td>872345.689655</td>\n",
+       "      <td>0.017018</td>\n",
+       "      <td>600 n 18th st</td>\n",
+       "      <td>None</td>\n",
+       "      <td>[600, n, 18th, st]</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>al</td>\n",
+       "      <td>al</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.005280</td>\n",
+       "      <td>0.005280</td>\n",
+       "      <td>14.856341</td>\n",
+       "      <td>8.975778</td>\n",
+       "      <td>birmingham</td>\n",
+       "      <td>birmingham</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.001995</td>\n",
+       "      <td>0.001995</td>\n",
+       "      <td>94.80739</td>\n",
+       "      <td>3.050898</td>\n",
+       "      <td>ALBM PWR</td>\n",
+       "      <td>ALBM PWR</td>\n",
+       "      <td>0</td>\n",
+       "      <td>2575</td>\n",
+       "      <td>0000003153</td>\n",
+       "      <td>alabama power co</td>\n",
+       "      <td>80977</td>\n",
+       "      <td>195</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>912914</th>\n",
+       "      <td>9.434494</td>\n",
+       "      <td>0.998557</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>132976</td>\n",
+       "      <td>79317</td>\n",
+       "      <td>air products and chemicals /de/</td>\n",
+       "      <td>air products and chemicals</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.000019</td>\n",
+       "      <td>0.000048</td>\n",
+       "      <td>6085.754919</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>7201 hamilton blvd</td>\n",
+       "      <td>7201 hamilton blvd</td>\n",
+       "      <td>[7201, hamilton, blvd]</td>\n",
+       "      <td>[7201, hamilton, blvd]</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>5.407499</td>\n",
+       "      <td>pa</td>\n",
+       "      <td>pa</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.030197</td>\n",
+       "      <td>0.030197</td>\n",
+       "      <td>14.856341</td>\n",
+       "      <td>1.569346</td>\n",
+       "      <td>allentown</td>\n",
+       "      <td>allentown</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.001137</td>\n",
+       "      <td>0.001137</td>\n",
+       "      <td>94.80739</td>\n",
+       "      <td>5.354027</td>\n",
+       "      <td>AR PRTKTS ANT XMKLS T</td>\n",
+       "      <td>AR PRTKTS ANT XMKLS</td>\n",
+       "      <td>0</td>\n",
+       "      <td>132976</td>\n",
+       "      <td>0000002969</td>\n",
+       "      <td>air products &amp; chemicals inc /de/</td>\n",
+       "      <td>79317</td>\n",
+       "      <td>991</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>197 rows × 41 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         match_weight  match_probability         source_dataset_l         source_dataset_r  record_id_l  record_id_r          company_name_no_legal_l     company_name_no_legal_r  gamma_company_name_no_legal  tf_company_name_no_legal_l  tf_company_name_no_legal_r  bf_company_name_no_legal  bf_tf_adj_company_name_no_legal      street_address_l      street_address_r     street_address_list_l     street_address_list_r  gamma_street_address  bf_street_address state_l state_r  gamma_state  tf_state_l  tf_state_r   bf_state  bf_tf_adj_state       city_l       city_r  gamma_city  tf_city_l  tf_city_r   bf_city  bf_tf_adj_city  company_name_mphone_l company_name_mphone_r match_key  record_id_x central_index_key                   company_name_raw  record_id_y  utility_id_eia\n",
+       "889845       5.679807           0.980865  __splink__input_table_0  __splink__input_table_1        51956        22658             constellation energy     constellation newenergy                            1                    0.000029                    0.000077               6085.754919                         1.000000     1310 point street                  None     [1310, point, street]                       NaN                  -1.0           1.000000      md      md            1    0.023298    0.023298  14.856341         2.034020    baltimore    baltimore           2   0.003678   0.003678  94.80739        1.654881           KNSTLXN ENRJ         KNSTLXN NWNRJ         0        51956        0001868275          constellation energy corp        22658           58491\n",
+       "884109      13.095633           0.999886  __splink__input_table_0  __splink__input_table_1       120267        96849                           evergy                      evergy                            2                    0.000019                    0.000019             872345.689655                         0.059564      1200 main street      1200 main street      [1200, main, street]      [1200, main, street]                   2.0           5.407499      mo      mo            1    0.011744    0.011744  14.856341         4.035057  kansas city  kansas city           2   0.001973   0.001973  94.80739        3.085372                   EFRJ                  EFRJ         0       120267        0001711269                       evergy, inc.        96849           64428\n",
+       "893941      12.486567           0.999826  __splink__input_table_0  __splink__input_table_1       120222        96211                    consol energy               consol energy                            2                    0.000058                    0.000058             872345.689655                         0.019855  275 technology drive  275 technology drive  [275, technology, drive]  [275, technology, drive]                   2.0           5.407499      pa      pa            1    0.030197    0.030197  14.856341         1.569346   canonsburg   canonsburg           2   0.000390   0.000390  94.80739       15.603165              KNSL ENRJ             KNSL ENRJ         0       120222        0001710366                 consol energy inc.        96211            4299\n",
+       "943594       9.161274           0.998256  __splink__input_table_0  __splink__input_table_1       119271        83669                    vistra energy               vistra energy                            2                    0.000019                    0.000019             872345.689655                         0.059564     6555 sierra drive     6555 sierra drive     [6555, sierra, drive]     [6555, sierra, drive]                   2.0           5.407499      tx      tx            1    0.080866    0.080866  14.856341         0.586015       irving       irving           2   0.004380   0.004380  94.80739        1.389595              FSTR ENRJ             FSTR ENRJ         0       119271        0001692819                vistra energy corp.        83669           62723\n",
+       "860414       7.576311           0.994788  __splink__input_table_0  __splink__input_table_1       119274        71441                           vistra                      vistra                            2                    0.000058                    0.000058             872345.689655                         0.019855     6555 sierra drive     6555 sierra drive     [6555, sierra, drive]     [6555, sierra, drive]                   2.0           5.407499      tx      tx            1    0.080866    0.080866  14.856341         0.586015       irving       irving           2   0.004380   0.004380  94.80739        1.389595                   FSTR                  FSTR         0       119274        0001692819                       vistra corp.        71441            5504\n",
+       "...               ...                ...                      ...                      ...          ...          ...                              ...                         ...                          ...                         ...                         ...                       ...                              ...                   ...                   ...                       ...                       ...                   ...                ...     ...     ...          ...         ...         ...        ...              ...          ...          ...         ...        ...        ...       ...             ...                    ...                   ...       ...          ...               ...                                ...          ...             ...\n",
+       "1026765     12.087133           0.999770  __splink__input_table_0  __splink__input_table_1       153106        79761           archer daniels midland      archer daniels midland                            2                    0.000058                    0.000058             872345.689655                         0.019855      4666 faries pkwy      4666 faries pkwy      [4666, faries, pkwy]      [4666, faries, pkwy]                   2.0           5.407499      il      il            1    0.033191    0.033191  14.856341         1.427770      decatur      decatur           2   0.000468   0.000468  94.80739       13.002638        ARXR TNLS MTLNT       ARXR TNLS MTLNT         0       153106        0000007084          archer daniels midland co        79761             772\n",
+       "656833       9.809977           0.998887  __splink__input_table_0  __splink__input_table_1       150546        79913                appalachian power           appalachian power                            2                    0.000077                    0.000077             872345.689655                         0.014891     1 riverside plaza     1 riverside plaza     [1, riverside, plaza]     [1, riverside, plaza]                   2.0           5.407499      oh      oh            1    0.018770    0.018770  14.856341         2.524754     columbus     columbus           2   0.003009   0.003009  94.80739        2.022633              APLXN PWR             APLXN PWR         0       150546        0000006879               appalachian power co        79913             733\n",
+       "640747      10.888046           0.999473  __splink__input_table_0  __splink__input_table_1       144743        80319      american crystal sugar /mn/      american crystal sugar                            1                    0.000010                    0.000029               6085.754919                         1.000000          101 n 3rd st                  None         [101, n, 3rd, st]                       NaN                  -1.0           1.000000      mn      mn            1    0.025996    0.025996  14.856341         1.822919     moorhead     moorhead           2   0.000089   0.000089  94.80739       68.263848     AMRKN KRSTL SKR MN       AMRKN KRSTL SKR         0       144743        0000004828     american crystal sugar co /mn/        80319             491\n",
+       "998578       9.990554           0.999018  __splink__input_table_0  __splink__input_table_1         2575        80977                    alabama power               alabama power                            2                    0.000067                    0.000067             872345.689655                         0.017018         600 n 18th st                  None        [600, n, 18th, st]                       NaN                  -1.0           1.000000      al      al            1    0.005280    0.005280  14.856341         8.975778   birmingham   birmingham           2   0.001995   0.001995  94.80739        3.050898               ALBM PWR              ALBM PWR         0         2575        0000003153                   alabama power co        80977             195\n",
+       "912914       9.434494           0.998557  __splink__input_table_0  __splink__input_table_1       132976        79317  air products and chemicals /de/  air products and chemicals                            1                    0.000019                    0.000048               6085.754919                         1.000000    7201 hamilton blvd    7201 hamilton blvd    [7201, hamilton, blvd]    [7201, hamilton, blvd]                   2.0           5.407499      pa      pa            1    0.030197    0.030197  14.856341         1.569346    allentown    allentown           2   0.001137   0.001137  94.80739        5.354027  AR PRTKTS ANT XMKLS T   AR PRTKTS ANT XMKLS         0       132976        0000002969  air products & chemicals inc /de/        79317             991\n",
+       "\n",
+       "[197 rows x 41 columns]"
+      ]
+     },
+     "execution_count": 437,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "preds_validation_df[preds_validation_df.match_probability > .9]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 89,
+   "id": "11190456-12a9-49df-b863-7a6f674e39eb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "validation_df = pd.read_csv(\"sec_eia_validation_set.csv\", dtype={\"central_index_key\": str})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 90,
+   "id": "5a57bae5-6188-4a6c-be9a-14a159f82d81",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "validation_df[\"central_index_key\"] = validation_df[\"central_index_key\"].str.zfill(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 438,
+   "id": "461725d0-28c8-43dd-bcd5-086b7d3f7d4b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "merged_df = validation_df.merge(\n",
+    "    preds_validation_df[[\"record_id_l\", \"record_id_r\", \"central_index_key\", \"utility_id_eia\", \"match_probability\", \"gamma_company_name_no_legal\"]].drop_duplicates(keep=\"first\"),\n",
+    "    how=\"left\",\n",
+    "    on=[\"central_index_key\", \"utility_id_eia\"],\n",
+    "    indicator=True\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 439,
+   "id": "4d45f339-7a5b-466a-81f5-c71e425a77df",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "merged_df[\"predicted_match\"] = merged_df[\"_merge\"].map({\"both\": 1, \"left_only\": 0})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 440,
+   "id": "182732ea-39c5-4fb4-9429-5f7aed781ef5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "merged_df[\"predicted_match\"] = merged_df[\"predicted_match\"].where(\n",
+    "    (merged_df.match_probability > .95),\n",
+    "    0\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 441,
+   "id": "bc058dbf-6f71-4978-90ce-5405edbbf9e5",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>central_index_key</th>\n",
+       "      <th>utility_id_eia</th>\n",
+       "      <th>sec_company_name</th>\n",
+       "      <th>eia_company_name</th>\n",
+       "      <th>match</th>\n",
+       "      <th>record_id_l</th>\n",
+       "      <th>record_id_r</th>\n",
+       "      <th>match_probability</th>\n",
+       "      <th>gamma_company_name_no_legal</th>\n",
+       "      <th>_merge</th>\n",
+       "      <th>predicted_match</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0000003153</td>\n",
+       "      <td>195</td>\n",
+       "      <td>alabama power co</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2575</td>\n",
+       "      <td>80977</td>\n",
+       "      <td>0.999018</td>\n",
+       "      <td>2</td>\n",
+       "      <td>both</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0001868941</td>\n",
+       "      <td>58702</td>\n",
+       "      <td>fluence energy, inc.</td>\n",
+       "      <td>Fluence</td>\n",
+       "      <td>0</td>\n",
+       "      <td>126809</td>\n",
+       "      <td>21615</td>\n",
+       "      <td>0.000002</td>\n",
+       "      <td>0</td>\n",
+       "      <td>both</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0000041091</td>\n",
+       "      <td>7140</td>\n",
+       "      <td>georgia power co</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1</td>\n",
+       "      <td>50428</td>\n",
+       "      <td>68242</td>\n",
+       "      <td>0.029853</td>\n",
+       "      <td>2</td>\n",
+       "      <td>both</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>0000022198</td>\n",
+       "      <td>4062</td>\n",
+       "      <td>columbus southern power co /oh/</td>\n",
+       "      <td>Columbus Southern Power Co</td>\n",
+       "      <td>1</td>\n",
+       "      <td>129635</td>\n",
+       "      <td>96300</td>\n",
+       "      <td>0.997628</td>\n",
+       "      <td>1</td>\n",
+       "      <td>both</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0001326160</td>\n",
+       "      <td>5416</td>\n",
+       "      <td>duke energy corp</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1</td>\n",
+       "      <td>37661</td>\n",
+       "      <td>71555</td>\n",
+       "      <td>0.926352</td>\n",
+       "      <td>2</td>\n",
+       "      <td>both</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>0000030371</td>\n",
+       "      <td>54905</td>\n",
+       "      <td>duke energy carolinas, llc</td>\n",
+       "      <td>Duke Energy Carolinas LLC</td>\n",
+       "      <td>1</td>\n",
+       "      <td>133261</td>\n",
+       "      <td>118543</td>\n",
+       "      <td>0.987916</td>\n",
+       "      <td>2</td>\n",
+       "      <td>both</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>0000869446</td>\n",
+       "      <td>57140</td>\n",
+       "      <td>berkshire realty co inc /de</td>\n",
+       "      <td>Berkshire Wind Power Cooperative Corp</td>\n",
+       "      <td>0</td>\n",
+       "      <td>198821</td>\n",
+       "      <td>89415</td>\n",
+       "      <td>0.000030</td>\n",
+       "      <td>0</td>\n",
+       "      <td>both</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>0000092122</td>\n",
+       "      <td>18195</td>\n",
+       "      <td>southern co</td>\n",
+       "      <td>southern co services inc</td>\n",
+       "      <td>0</td>\n",
+       "      <td>50417</td>\n",
+       "      <td>111824</td>\n",
+       "      <td>0.000063</td>\n",
+       "      <td>0</td>\n",
+       "      <td>both</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>0000092122</td>\n",
+       "      <td>17650</td>\n",
+       "      <td>southern co</td>\n",
+       "      <td>Southern Power Co</td>\n",
+       "      <td>0</td>\n",
+       "      <td>50417</td>\n",
+       "      <td>49613</td>\n",
+       "      <td>0.004315</td>\n",
+       "      <td>0</td>\n",
+       "      <td>both</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>0000075488</td>\n",
+       "      <td>14328</td>\n",
+       "      <td>pacific gas &amp; electric co</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2898</td>\n",
+       "      <td>55480</td>\n",
+       "      <td>0.624991</td>\n",
+       "      <td>2</td>\n",
+       "      <td>both</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>0001031296</td>\n",
+       "      <td>6526</td>\n",
+       "      <td>firstenergy corp</td>\n",
+       "      <td>FirstEnergy</td>\n",
+       "      <td>0</td>\n",
+       "      <td>14192</td>\n",
+       "      <td>69716</td>\n",
+       "      <td>0.999707</td>\n",
+       "      <td>2</td>\n",
+       "      <td>both</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>0001031296</td>\n",
+       "      <td>54776</td>\n",
+       "      <td>firstenergy corp</td>\n",
+       "      <td>FirstEnergy Nuclear Generation Corp</td>\n",
+       "      <td>0</td>\n",
+       "      <td>14192</td>\n",
+       "      <td>102163</td>\n",
+       "      <td>0.000066</td>\n",
+       "      <td>0</td>\n",
+       "      <td>both</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12</th>\n",
+       "      <td>0001031296</td>\n",
+       "      <td>6458</td>\n",
+       "      <td>firstenergy corp</td>\n",
+       "      <td>First Energy Services</td>\n",
+       "      <td>0</td>\n",
+       "      <td>14192</td>\n",
+       "      <td>162033</td>\n",
+       "      <td>0.000066</td>\n",
+       "      <td>0</td>\n",
+       "      <td>both</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>0001031296</td>\n",
+       "      <td>32208</td>\n",
+       "      <td>firstenergy corp</td>\n",
+       "      <td>First Energy Corp</td>\n",
+       "      <td>1</td>\n",
+       "      <td>14192</td>\n",
+       "      <td>121855</td>\n",
+       "      <td>0.010697</td>\n",
+       "      <td>1</td>\n",
+       "      <td>both</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14</th>\n",
+       "      <td>0000100122</td>\n",
+       "      <td>24211</td>\n",
+       "      <td>tucson electric power co</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1</td>\n",
+       "      <td>715</td>\n",
+       "      <td>41507</td>\n",
+       "      <td>0.999798</td>\n",
+       "      <td>2</td>\n",
+       "      <td>both</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15</th>\n",
+       "      <td>0000096271</td>\n",
+       "      <td>18454</td>\n",
+       "      <td>tampa electric co</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1</td>\n",
+       "      <td>231716</td>\n",
+       "      <td>47982</td>\n",
+       "      <td>0.989228</td>\n",
+       "      <td>2</td>\n",
+       "      <td>both</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>16</th>\n",
+       "      <td>0000715957</td>\n",
+       "      <td>5248</td>\n",
+       "      <td>dominion energy, inc</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1</td>\n",
+       "      <td>15937</td>\n",
+       "      <td>71878</td>\n",
+       "      <td>0.998282</td>\n",
+       "      <td>2</td>\n",
+       "      <td>both</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17</th>\n",
+       "      <td>0001013871</td>\n",
+       "      <td>59883</td>\n",
+       "      <td>nrg energy, inc</td>\n",
+       "      <td>NRG Energy Gas &amp; Wind Holdings Inc</td>\n",
+       "      <td>0</td>\n",
+       "      <td>7168</td>\n",
+       "      <td>17454</td>\n",
+       "      <td>0.002575</td>\n",
+       "      <td>0</td>\n",
+       "      <td>both</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>18</th>\n",
+       "      <td>0001013871</td>\n",
+       "      <td>13377</td>\n",
+       "      <td>nrg energy inc</td>\n",
+       "      <td>NRG Energy Inc</td>\n",
+       "      <td>1</td>\n",
+       "      <td>7173</td>\n",
+       "      <td>95029</td>\n",
+       "      <td>0.988801</td>\n",
+       "      <td>2</td>\n",
+       "      <td>both</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>19</th>\n",
+       "      <td>0000788816</td>\n",
+       "      <td>13994</td>\n",
+       "      <td>oglethorpe power corp</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1</td>\n",
+       "      <td>172902</td>\n",
+       "      <td>56478</td>\n",
+       "      <td>0.999768</td>\n",
+       "      <td>2</td>\n",
+       "      <td>both</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>20</th>\n",
+       "      <td>0000018675</td>\n",
+       "      <td>3266</td>\n",
+       "      <td>central maine power co</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1</td>\n",
+       "      <td>126771</td>\n",
+       "      <td>176663</td>\n",
+       "      <td>0.897700</td>\n",
+       "      <td>2</td>\n",
+       "      <td>both</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   central_index_key  utility_id_eia                 sec_company_name                       eia_company_name  match  record_id_l  record_id_r  match_probability  gamma_company_name_no_legal _merge  predicted_match\n",
+       "0         0000003153             195                 alabama power co                                    NaN      1         2575        80977           0.999018                            2   both              1.0\n",
+       "1         0001868941           58702             fluence energy, inc.                                Fluence      0       126809        21615           0.000002                            0   both              0.0\n",
+       "2         0000041091            7140                 georgia power co                                    NaN      1        50428        68242           0.029853                            2   both              0.0\n",
+       "3         0000022198            4062  columbus southern power co /oh/             Columbus Southern Power Co      1       129635        96300           0.997628                            1   both              1.0\n",
+       "4         0001326160            5416                 duke energy corp                                    NaN      1        37661        71555           0.926352                            2   both              0.0\n",
+       "5         0000030371           54905       duke energy carolinas, llc              Duke Energy Carolinas LLC      1       133261       118543           0.987916                            2   both              1.0\n",
+       "6         0000869446           57140      berkshire realty co inc /de  Berkshire Wind Power Cooperative Corp      0       198821        89415           0.000030                            0   both              0.0\n",
+       "7         0000092122           18195                      southern co               southern co services inc      0        50417       111824           0.000063                            0   both              0.0\n",
+       "8         0000092122           17650                      southern co                      Southern Power Co      0        50417        49613           0.004315                            0   both              0.0\n",
+       "9         0000075488           14328        pacific gas & electric co                                    NaN      1         2898        55480           0.624991                            2   both              0.0\n",
+       "10        0001031296            6526                 firstenergy corp                            FirstEnergy      0        14192        69716           0.999707                            2   both              1.0\n",
+       "11        0001031296           54776                 firstenergy corp    FirstEnergy Nuclear Generation Corp      0        14192       102163           0.000066                            0   both              0.0\n",
+       "12        0001031296            6458                 firstenergy corp                  First Energy Services      0        14192       162033           0.000066                            0   both              0.0\n",
+       "13        0001031296           32208                 firstenergy corp                      First Energy Corp      1        14192       121855           0.010697                            1   both              0.0\n",
+       "14        0000100122           24211         tucson electric power co                                    NaN      1          715        41507           0.999798                            2   both              1.0\n",
+       "15        0000096271           18454                tampa electric co                                    NaN      1       231716        47982           0.989228                            2   both              1.0\n",
+       "16        0000715957            5248             dominion energy, inc                                    NaN      1        15937        71878           0.998282                            2   both              1.0\n",
+       "17        0001013871           59883                  nrg energy, inc     NRG Energy Gas & Wind Holdings Inc      0         7168        17454           0.002575                            0   both              0.0\n",
+       "18        0001013871           13377                   nrg energy inc                         NRG Energy Inc      1         7173        95029           0.988801                            2   both              1.0\n",
+       "19        0000788816           13994            oglethorpe power corp                                    NaN      1       172902        56478           0.999768                            2   both              1.0\n",
+       "20        0000018675            3266           central maine power co                                    NaN      1       126771       176663           0.897700                            2   both              0.0"
+      ]
+     },
+     "execution_count": 441,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "merged_df.head(50)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 442,
+   "id": "2fc6ed05-5b35-4856-a668-f16e253e7fea",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "precision = precision_score(merged_df['match'], merged_df['predicted_match'])\n",
+    "recall = recall_score(merged_df['match'], merged_df['predicted_match'])\n",
+    "accuracy = accuracy_score(merged_df['match'], merged_df['predicted_match'])\n",
+    "# roc_auc = roc_auc_score(merged_df['match'], merged_df['match_probability'])\n",
+    "\n",
+    "# Confusion matrix\n",
+    "conf_matrix = confusion_matrix(merged_df['match'], merged_df['predicted_match'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 443,
+   "id": "99a7bc64-9f32-4f53-9a89-63a31ff7debe",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(np.float64(0.8888888888888888),\n",
+       " np.float64(0.6153846153846154),\n",
+       " 0.7142857142857143)"
+      ]
+     },
+     "execution_count": 443,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "precision, recall, accuracy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 444,
+   "id": "08932be5-b90c-440d-9efb-156cb4d63c93",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Predicted Negative</th>\n",
+       "      <th>Predicted Positive</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>Negative</th>\n",
+       "      <td>7</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Positive</th>\n",
+       "      <td>5</td>\n",
+       "      <td>8</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "          Predicted Negative  Predicted Positive\n",
+       "Negative                   7                   1\n",
+       "Positive                   5                   8"
+      ]
+     },
+     "execution_count": 444,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pd.DataFrame(\n",
+    "    conf_matrix,\n",
+    "    index=[\"Negative\", \"Positive\"],\n",
+    "    columns=[\"Predicted Negative\", \"Predicted Positive\"]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 445,
+   "id": "025c80e9-5055-4eaa-a873-38b910cd7f94",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "incorrect_df = merged_df[merged_df.match != merged_df.predicted_match]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 446,
+   "id": "e4f44fda-1b55-4f9c-a96d-5eec36dac768",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
        "    }\n",
        "\n",
-       "    if(typeof define === \"function\" && define.amd) {\n",
-       "      requirejs.config({paths});\n",
-       "      require([\"vega-embed\"], displayChart, err => showError(`Error loading script: ${err.message}`));\n",
-       "    } else {\n",
-       "      maybeLoadScript(\"vega\", \"5\")\n",
-       "        .then(() => maybeLoadScript(\"vega-lite\", \"5.20.1\"))\n",
-       "        .then(() => maybeLoadScript(\"vega-embed\", \"6\"))\n",
-       "        .catch(showError)\n",
-       "        .then(() => displayChart(vegaEmbed));\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
        "    }\n",
-       "  })({\"config\": {\"view\": {\"continuousWidth\": 300, \"continuousHeight\": 300, \"discreteHeight\": 60, \"discreteWidth\": 400}, \"header\": {\"title\": null}, \"mark\": {\"tooltip\": null}, \"title\": {\"anchor\": \"middle\"}}, \"vconcat\": [{\"mark\": {\"type\": \"bar\", \"clip\": true, \"height\": 15}, \"encoding\": {\"color\": {\"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-10, 0, 10], \"interpolate\": \"lab\", \"range\": [\"red\", \"#bbbbbb\", \"green\"]}, \"title\": \"Match weight\", \"type\": \"quantitative\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"probability_two_random_records_match\", \"format\": \".4f\", \"title\": \"Probability two random records match\", \"type\": \"nominal\"}, {\"field\": \"log2_bayes_factor\", \"format\": \",.4f\", \"title\": \"Equivalent match weight\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"domain\": false, \"gridColor\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": \"#aaa\"}, \"value\": \"#ddd\"}, \"gridDash\": {\"condition\": {\"test\": \"abs(datum.value / 10) == 1\", \"value\": [3]}, \"value\": null}, \"gridWidth\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": 2}, \"value\": 1}, \"labels\": false, \"ticks\": false, \"title\": \"\"}, \"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-16, 16]}, \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": \"Prior (starting) match weight\", \"titleAlign\": \"right\", \"titleAngle\": 0, \"titleFontWeight\": \"normal\"}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": 20, \"transform\": [{\"filter\": \"(datum.comparison_name == 
'probability_two_random_records_match')\"}]}, {\"mark\": {\"type\": \"bar\", \"clip\": true}, \"encoding\": {\"color\": {\"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-10, 0, 10], \"interpolate\": \"lab\", \"range\": [\"red\", \"#bbbbbb\", \"green\"]}, \"title\": \"Match weight\", \"type\": \"quantitative\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labelAlign\": \"left\", \"labelAnchor\": \"middle\", \"labelAngle\": 0}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"gridColor\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": \"#aaa\"}, \"value\": \"#ddd\"}, \"gridDash\": {\"condition\": {\"test\": \"abs(datum.value / 10) == 1\", \"value\": [3]}, \"value\": null}, \"gridWidth\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": 2}, \"value\": 1}, 
\"title\": \"Comparison level match weight = log2(m/u)\"}, \"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-16, 16]}, \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"axis\": {\"y\": \"independent\"}, \"scale\": {\"y\": \"independent\"}}, \"transform\": [{\"filter\": \"(datum.comparison_name != 'probability_two_random_records_match')\"}]}], \"data\": {\"name\": \"data-f89554aea166da6e147a98b6901fa5cf\"}, \"params\": [{\"name\": \"mouse_zoom\", \"select\": {\"type\": \"interval\", \"encodings\": [\"x\"]}, \"bind\": \"scales\", \"views\": []}], \"resolve\": {\"axis\": {\"y\": \"independent\"}, \"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Model parameters (components of final match weight)\", \"subtitle\": \"Use mousewheel to zoom\"}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-f89554aea166da6e147a98b6901fa5cf\": [{\"comparison_name\": \"probability_two_random_records_match\", \"sql_condition\": null, \"label_for_charts\": \"\", \"m_probability\": null, \"u_probability\": null, \"m_probability_description\": null, \"u_probability_description\": null, \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": null, \"is_null_level\": false, \"bayes_factor\": 2.1821114058696376e-05, \"log2_bayes_factor\": -15.483915715308404, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 0, \"bayes_factor_description\": \"The probability that two random records drawn at random match is 0.000 or one in  45,828.2 records.This is equivalent to a starting match weight of -15.484.\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": -1}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"\\\"company_name_l\\\" = 
\\\"company_name_r\\\"\", \"label_for_charts\": \"Exact match on company_name\", \"m_probability\": 0.019774989606985885, \"u_probability\": 2.027918645697988e-06, \"m_probability_description\": \"Amongst matching record comparisons, 1.977% of records (i.e. one in 50.57) are in the exact match on company_name comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.0002028% of records (i.e. one in 493,116) are in the exact match on company_name comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 9751.372250033997, \"log2_bayes_factor\": 13.251389539320668, \"comparison_vector_value\": 3, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `exact match on company_name` then comparison is 9,751 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"jaccard(\\\"company_name_l\\\", \\\"company_name_r\\\") >= 0.9\", \"label_for_charts\": \"Jaccard distance of 'company_name >= 0.9'\", \"m_probability\": 0.01846534387207393, \"u_probability\": 0.0019522096829252629, \"m_probability_description\": \"Amongst matching record comparisons, 1.847% of records (i.e. one in 54.16) are in the jaccard distance of 'company_name >= 0.9' comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.1952% of records (i.e. 
one in 512) are in the jaccard distance of 'company_name >= 0.9' comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 9.458688804577989, \"log2_bayes_factor\": 3.241640206160788, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `jaccard distance of 'company_name >= 0.9'` then comparison is 9.459 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"jaccard(\\\"company_name_l\\\", \\\"company_name_r\\\") >= 0.7\", \"label_for_charts\": \"Jaccard distance of 'company_name >= 0.7'\", \"m_probability\": 0.2788273861192321, \"u_probability\": 0.19060658426557237, \"m_probability_description\": \"Amongst matching record comparisons, 27.88% of records (i.e. one in 3.586) are in the jaccard distance of 'company_name >= 0.7' comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 19.06% of records (i.e. 
one in 5.246) are in the jaccard distance of 'company_name >= 0.7' comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 1.462842362941364, \"log2_bayes_factor\": 0.5487743118852083, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `jaccard distance of 'company_name >= 0.7'` then comparison is 1.463 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.6829322804017081, \"u_probability\": 0.8074391781328567, \"m_probability_description\": \"Amongst matching record comparisons, 68.29% of records (i.e. one in 1.464) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 80.74% of records (i.e. one in 1.238) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.8458002768467817, \"log2_bayes_factor\": -0.2416110622985932, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.182 times less likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 0}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"\\\"street_address_l\\\" = \\\"street_address_r\\\"\", \"label_for_charts\": \"Exact match on street_address\", \"m_probability\": 0.13355154452924042, \"u_probability\": 5.3722899239283746e-06, \"m_probability_description\": \"Amongst matching record comparisons, 13.36% of records (i.e. 
one in 7.488) are in the exact match on street_address comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.0005372% of records (i.e. one in 186,140) are in the exact match on street_address comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 24859.333062870825, \"log2_bayes_factor\": 14.601499971201484, \"comparison_vector_value\": 3, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `exact match on street_address` then comparison is 24,859 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"levenshtein(\\\"street_address_l\\\", \\\"street_address_r\\\") <= 1\", \"label_for_charts\": \"Levenshtein distance of street_address <= 1\", \"m_probability\": 0.012195893520133996, \"u_probability\": 1.4814496456893396e-05, \"m_probability_description\": \"Amongst matching record comparisons, 1.22% of records (i.e. one in 81.99) are in the levenshtein distance of street_address <= 1 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.001481% of records (i.e. 
one in 67,501) are in the levenshtein distance of street_address <= 1 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 823.2405033556894, \"log2_bayes_factor\": 9.685170154213719, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `levenshtein distance of street_address <= 1` then comparison is 823 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"levenshtein(\\\"street_address_l\\\", \\\"street_address_r\\\") <= 2\", \"label_for_charts\": \"Levenshtein distance of street_address <= 2\", \"m_probability\": 0.01029028015491091, \"u_probability\": 0.00012974894149608832, \"m_probability_description\": \"Amongst matching record comparisons, 1.029% of records (i.e. one in 97.18) are in the levenshtein distance of street_address <= 2 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.01297% of records (i.e. 
one in 7,707) are in the levenshtein distance of street_address <= 2 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 79.30916457781771, \"log2_bayes_factor\": 6.309415681207828, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `levenshtein distance of street_address <= 2` then comparison is 79.31 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.8439622817957148, \"u_probability\": 0.9998500642721231, \"m_probability_description\": \"Amongst matching record comparisons, 84.4% of records (i.e. one in 1.185) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 99.99% of records (i.e. one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.8440888408704634, \"log2_bayes_factor\": -0.2445332434217164, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.185 times less likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 1}, {\"comparison_name\": \"zip_code\", \"sql_condition\": \"\\\"zip_code_l\\\" = \\\"zip_code_r\\\"\", \"label_for_charts\": \"Exact match on zip_code\", \"m_probability\": 0.5972484099495287, \"u_probability\": 0.0004124676922519243, \"m_probability_description\": \"Amongst matching record comparisons, 59.72% of records (i.e. 
one in 1.674) are in the exact match on zip_code comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.04125% of records (i.e. one in 2,424) are in the exact match on zip_code comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"zip_code\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 1447.9883422838975, \"log2_bayes_factor\": 10.499834272030089, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on zip_code` then comparison is 1,448 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 2}, {\"comparison_name\": \"zip_code\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.40275159005047145, \"u_probability\": 0.9995875323077481, \"m_probability_description\": \"Amongst matching record comparisons, 40.28% of records (i.e. one in 2.483) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 99.96% of records (i.e. 
one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.4029177806176101, \"log2_bayes_factor\": -1.3114426223796678, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 2.482 times less likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 2}, {\"comparison_name\": \"state\", \"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Exact match on state\", \"m_probability\": 0.9589078004241631, \"u_probability\": 0.04243320317142917, \"m_probability_description\": \"Amongst matching record comparisons, 95.89% of records (i.e. one in 1.043) are in the exact match on state comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 4.243% of records (i.e. one in 23.57) are in the exact match on state comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"state\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 22.598053617357085, \"log2_bayes_factor\": 4.498126612770657, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on state` then comparison is 22.6 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 3}, {\"comparison_name\": \"state\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.04109219957583696, \"u_probability\": 0.9575667968285708, \"m_probability_description\": \"Amongst matching record comparisons, 4.109% of records (i.e. 
one in 24.34) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 95.76% of records (i.e. one in 1.044) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.04291314163349538, \"log2_bayes_factor\": -4.542436666382371, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 23.3 times less likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 3}, {\"comparison_name\": \"city\", \"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Exact match on city\", \"m_probability\": 0.8925172913420765, \"u_probability\": 0.00414046488931094, \"m_probability_description\": \"Amongst matching record comparisons, 89.25% of records (i.e. one in 1.12) are in the exact match on city comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.414% of records (i.e. 
one in 242) are in the exact match on city comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"city\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 215.55968114744962, \"log2_bayes_factor\": 7.751943547604383, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on city` then comparison is 216 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 4}, {\"comparison_name\": \"city\", \"sql_condition\": \"jaro_winkler_similarity(\\\"city_l\\\", \\\"city_r\\\") >= 0.9\", \"label_for_charts\": \"Jaro-Winkler distance of city >= 0.9\", \"m_probability\": 0.02417396754770653, \"u_probability\": 0.00030828203643208597, \"m_probability_description\": \"Amongst matching record comparisons, 2.417% of records (i.e. one in 41.37) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.03083% of records (i.e. one in 3,244) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 78.41510270103596, \"log2_bayes_factor\": 6.2930596381757935, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of city >= 0.9` then comparison is 78.42 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 4}, {\"comparison_name\": \"city\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.08330874111021702, \"u_probability\": 0.995551253074257, \"m_probability_description\": \"Amongst matching record comparisons, 8.331% of records (i.e. 
one in 12) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 99.56% of records (i.e. one in 1.004) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.08368101677634383, \"log2_bayes_factor\": -3.578955808442296, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 11.95 times less likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 4}]}}, {\"mode\": \"vega-lite\"});\n",
-       "</script>"
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>central_index_key</th>\n",
+       "      <th>utility_id_eia</th>\n",
+       "      <th>sec_company_name</th>\n",
+       "      <th>eia_company_name</th>\n",
+       "      <th>match</th>\n",
+       "      <th>record_id_l</th>\n",
+       "      <th>record_id_r</th>\n",
+       "      <th>match_probability</th>\n",
+       "      <th>gamma_company_name_no_legal</th>\n",
+       "      <th>_merge</th>\n",
+       "      <th>predicted_match</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0000041091</td>\n",
+       "      <td>7140</td>\n",
+       "      <td>georgia power co</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1</td>\n",
+       "      <td>50428</td>\n",
+       "      <td>68242</td>\n",
+       "      <td>0.029853</td>\n",
+       "      <td>2</td>\n",
+       "      <td>both</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0001326160</td>\n",
+       "      <td>5416</td>\n",
+       "      <td>duke energy corp</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1</td>\n",
+       "      <td>37661</td>\n",
+       "      <td>71555</td>\n",
+       "      <td>0.926352</td>\n",
+       "      <td>2</td>\n",
+       "      <td>both</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>0000075488</td>\n",
+       "      <td>14328</td>\n",
+       "      <td>pacific gas &amp; electric co</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2898</td>\n",
+       "      <td>55480</td>\n",
+       "      <td>0.624991</td>\n",
+       "      <td>2</td>\n",
+       "      <td>both</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>0001031296</td>\n",
+       "      <td>6526</td>\n",
+       "      <td>firstenergy corp</td>\n",
+       "      <td>FirstEnergy</td>\n",
+       "      <td>0</td>\n",
+       "      <td>14192</td>\n",
+       "      <td>69716</td>\n",
+       "      <td>0.999707</td>\n",
+       "      <td>2</td>\n",
+       "      <td>both</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>0001031296</td>\n",
+       "      <td>32208</td>\n",
+       "      <td>firstenergy corp</td>\n",
+       "      <td>First Energy Corp</td>\n",
+       "      <td>1</td>\n",
+       "      <td>14192</td>\n",
+       "      <td>121855</td>\n",
+       "      <td>0.010697</td>\n",
+       "      <td>1</td>\n",
+       "      <td>both</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>20</th>\n",
+       "      <td>0000018675</td>\n",
+       "      <td>3266</td>\n",
+       "      <td>central maine power co</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1</td>\n",
+       "      <td>126771</td>\n",
+       "      <td>176663</td>\n",
+       "      <td>0.897700</td>\n",
+       "      <td>2</td>\n",
+       "      <td>both</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
       ],
       "text/plain": [
-       "alt.VConcatChart(...)"
+       "   central_index_key  utility_id_eia           sec_company_name   eia_company_name  match  record_id_l  record_id_r  match_probability  gamma_company_name_no_legal _merge  predicted_match\n",
+       "2         0000041091            7140           georgia power co                NaN      1        50428        68242           0.029853                            2   both              0.0\n",
+       "4         0001326160            5416           duke energy corp                NaN      1        37661        71555           0.926352                            2   both              0.0\n",
+       "9         0000075488           14328  pacific gas & electric co                NaN      1         2898        55480           0.624991                            2   both              0.0\n",
+       "10        0001031296            6526           firstenergy corp        FirstEnergy      0        14192        69716           0.999707                            2   both              1.0\n",
+       "13        0001031296           32208           firstenergy corp  First Energy Corp      1        14192       121855           0.010697                            1   both              0.0\n",
+       "20        0000018675            3266     central maine power co                NaN      1       126771       176663           0.897700                            2   both              0.0"
       ]
      },
-     "execution_count": 388,
+     "execution_count": 446,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "linker.visualisations.match_weights_chart()"
+    "incorrect_df"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 389,
-   "id": "5e21bf55-64ac-4f4b-8f1c-d7507b5e7af6",
+   "execution_count": 447,
+   "id": "c425a676-aa6e-4d8f-b814-931da392c2ff",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "recs_to_view = []\n",
+    "for idx, rec in incorrect_df.iterrows():\n",
+    "    full_rec = preds_validation_df[\n",
+    "        (preds_validation_df.record_id_l == rec.record_id_l) & \n",
+    "        (preds_validation_df.record_id_r == rec.record_id_r)\n",
+    "    ].squeeze()\n",
+    "    if full_rec.empty:\n",
+    "        continue\n",
+    "    recs_to_view.append(full_rec.to_dict())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 448,
+   "id": "ff55f2cb-7ce1-4697-99e7-bf22918f7ed1",
    "metadata": {},
    "outputs": [
     {
@@ -2571,23 +4618,23 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-7b23fa91c63f4c55b1f9b687acbced53.vega-embed {\n",
+       "  #altair-viz-e70cd055b0c84ec9b321e88181d19e2b.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-7b23fa91c63f4c55b1f9b687acbced53.vega-embed details,\n",
-       "  #altair-viz-7b23fa91c63f4c55b1f9b687acbced53.vega-embed details summary {\n",
+       "  #altair-viz-e70cd055b0c84ec9b321e88181d19e2b.vega-embed details,\n",
+       "  #altair-viz-e70cd055b0c84ec9b321e88181d19e2b.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-7b23fa91c63f4c55b1f9b687acbced53\"></div>\n",
+       "<div id=\"altair-viz-e70cd055b0c84ec9b321e88181d19e2b\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-7b23fa91c63f4c55b1f9b687acbced53\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-7b23fa91c63f4c55b1f9b687acbced53\");\n",
+       "    if (outputDiv.id !== \"altair-viz-e70cd055b0c84ec9b321e88181d19e2b\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-e70cd055b0c84ec9b321e88181d19e2b\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -2633,96 +4680,34 @@
        "        .catch(showError)\n",
        "        .then(() => displayChart(vegaEmbed));\n",
        "    }\n",
-       "  })({\"config\": {\"view\": {\"continuousWidth\": 300, \"continuousHeight\": 300, \"discreteHeight\": 300, \"discreteWidth\": 400}, \"header\": {\"title\": null}, \"title\": {\"anchor\": \"middle\", \"offset\": 10}}, \"hconcat\": [{\"mark\": \"bar\", \"encoding\": {\"color\": {\"value\": \"green\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labelAlign\": \"left\", \"labelAnchor\": \"middle\", \"labelAngle\": 0}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"title\": \"Proportion of record comparisons\"}, \"field\": \"m_probability\", \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Amongst 
matching record comparisons:\", \"fontSize\": 12, \"fontWeight\": \"bold\"}, \"transform\": [{\"filter\": \"(datum.bayes_factor != 'no-op filter due to vega lite issue 4680')\"}], \"width\": 150}, {\"mark\": \"bar\", \"encoding\": {\"color\": {\"value\": \"red\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labels\": false}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"title\": \"Proportion of record comparisons\"}, \"field\": \"u_probability\", \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Amongst non-matching record comparisons:\", \"fontSize\": 12, \"fontWeight\": \"bold\"}, \"transform\": 
[{\"filter\": \"(datum.bayes_factor != 'no-op filter2 due to vega lite issue 4680')\"}], \"width\": 150}], \"data\": {\"name\": \"data-3507ca9ecb389fb002b9f229324388dd\"}, \"title\": {\"text\": \"Proportion of record comparisons in each comparison level by match status\", \"subtitle\": \"(m and u probabilities)\"}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-3507ca9ecb389fb002b9f229324388dd\": [{\"comparison_name\": \"company_name\", \"sql_condition\": \"\\\"company_name_l\\\" = \\\"company_name_r\\\"\", \"label_for_charts\": \"Exact match on company_name\", \"m_probability\": 0.019774989606985885, \"u_probability\": 2.027918645697988e-06, \"m_probability_description\": \"Amongst matching record comparisons, 1.977% of records (i.e. one in 50.57) are in the exact match on company_name comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.0002028% of records (i.e. one in 493,116) are in the exact match on company_name comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 9751.372250033997, \"log2_bayes_factor\": 13.251389539320668, \"comparison_vector_value\": 3, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `exact match on company_name` then comparison is 9,751 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"jaccard(\\\"company_name_l\\\", \\\"company_name_r\\\") >= 0.9\", \"label_for_charts\": \"Jaccard distance of 'company_name >= 0.9'\", \"m_probability\": 0.01846534387207393, \"u_probability\": 0.0019522096829252629, \"m_probability_description\": \"Amongst matching record comparisons, 1.847% of records (i.e. 
one in 54.16) are in the jaccard distance of 'company_name >= 0.9' comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.1952% of records (i.e. one in 512) are in the jaccard distance of 'company_name >= 0.9' comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 9.458688804577989, \"log2_bayes_factor\": 3.241640206160788, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `jaccard distance of 'company_name >= 0.9'` then comparison is 9.459 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"jaccard(\\\"company_name_l\\\", \\\"company_name_r\\\") >= 0.7\", \"label_for_charts\": \"Jaccard distance of 'company_name >= 0.7'\", \"m_probability\": 0.2788273861192321, \"u_probability\": 0.19060658426557237, \"m_probability_description\": \"Amongst matching record comparisons, 27.88% of records (i.e. one in 3.586) are in the jaccard distance of 'company_name >= 0.7' comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 19.06% of records (i.e. 
one in 5.246) are in the jaccard distance of 'company_name >= 0.7' comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 1.462842362941364, \"log2_bayes_factor\": 0.5487743118852083, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `jaccard distance of 'company_name >= 0.7'` then comparison is 1.463 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.6829322804017081, \"u_probability\": 0.8074391781328567, \"m_probability_description\": \"Amongst matching record comparisons, 68.29% of records (i.e. one in 1.464) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 80.74% of records (i.e. one in 1.238) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.8458002768467817, \"log2_bayes_factor\": -0.2416110622985932, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.182 times less likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 0}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"\\\"street_address_l\\\" = \\\"street_address_r\\\"\", \"label_for_charts\": \"Exact match on street_address\", \"m_probability\": 0.13355154452924042, \"u_probability\": 5.3722899239283746e-06, \"m_probability_description\": \"Amongst matching record comparisons, 13.36% of records (i.e. 
one in 7.488) are in the exact match on street_address comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.0005372% of records (i.e. one in 186,140) are in the exact match on street_address comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 24859.333062870825, \"log2_bayes_factor\": 14.601499971201484, \"comparison_vector_value\": 3, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `exact match on street_address` then comparison is 24,859 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"levenshtein(\\\"street_address_l\\\", \\\"street_address_r\\\") <= 1\", \"label_for_charts\": \"Levenshtein distance of street_address <= 1\", \"m_probability\": 0.012195893520133996, \"u_probability\": 1.4814496456893396e-05, \"m_probability_description\": \"Amongst matching record comparisons, 1.22% of records (i.e. one in 81.99) are in the levenshtein distance of street_address <= 1 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.001481% of records (i.e. 
one in 67,501) are in the levenshtein distance of street_address <= 1 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 823.2405033556894, \"log2_bayes_factor\": 9.685170154213719, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `levenshtein distance of street_address <= 1` then comparison is 823 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"levenshtein(\\\"street_address_l\\\", \\\"street_address_r\\\") <= 2\", \"label_for_charts\": \"Levenshtein distance of street_address <= 2\", \"m_probability\": 0.01029028015491091, \"u_probability\": 0.00012974894149608832, \"m_probability_description\": \"Amongst matching record comparisons, 1.029% of records (i.e. one in 97.18) are in the levenshtein distance of street_address <= 2 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.01297% of records (i.e. 
one in 7,707) are in the levenshtein distance of street_address <= 2 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 79.30916457781771, \"log2_bayes_factor\": 6.309415681207828, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `levenshtein distance of street_address <= 2` then comparison is 79.31 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.8439622817957148, \"u_probability\": 0.9998500642721231, \"m_probability_description\": \"Amongst matching record comparisons, 84.4% of records (i.e. one in 1.185) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 99.99% of records (i.e. one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.8440888408704634, \"log2_bayes_factor\": -0.2445332434217164, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 3, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.185 times less likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 1}, {\"comparison_name\": \"zip_code\", \"sql_condition\": \"\\\"zip_code_l\\\" = \\\"zip_code_r\\\"\", \"label_for_charts\": \"Exact match on zip_code\", \"m_probability\": 0.5972484099495287, \"u_probability\": 0.0004124676922519243, \"m_probability_description\": \"Amongst matching record comparisons, 59.72% of records (i.e. 
one in 1.674) are in the exact match on zip_code comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.04125% of records (i.e. one in 2,424) are in the exact match on zip_code comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"zip_code\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 1447.9883422838975, \"log2_bayes_factor\": 10.499834272030089, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on zip_code` then comparison is 1,448 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 2}, {\"comparison_name\": \"zip_code\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.40275159005047145, \"u_probability\": 0.9995875323077481, \"m_probability_description\": \"Amongst matching record comparisons, 40.28% of records (i.e. one in 2.483) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 99.96% of records (i.e. 
one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.4029177806176101, \"log2_bayes_factor\": -1.3114426223796678, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 2.482 times less likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 2}, {\"comparison_name\": \"state\", \"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Exact match on state\", \"m_probability\": 0.9589078004241631, \"u_probability\": 0.04243320317142917, \"m_probability_description\": \"Amongst matching record comparisons, 95.89% of records (i.e. one in 1.043) are in the exact match on state comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 4.243% of records (i.e. one in 23.57) are in the exact match on state comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"state\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 22.598053617357085, \"log2_bayes_factor\": 4.498126612770657, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on state` then comparison is 22.6 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 3}, {\"comparison_name\": \"state\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.04109219957583696, \"u_probability\": 0.9575667968285708, \"m_probability_description\": \"Amongst matching record comparisons, 4.109% of records (i.e. 
one in 24.34) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 95.76% of records (i.e. one in 1.044) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.04291314163349538, \"log2_bayes_factor\": -4.542436666382371, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 23.3 times less likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 3}, {\"comparison_name\": \"city\", \"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Exact match on city\", \"m_probability\": 0.8925172913420765, \"u_probability\": 0.00414046488931094, \"m_probability_description\": \"Amongst matching record comparisons, 89.25% of records (i.e. one in 1.12) are in the exact match on city comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.414% of records (i.e. 
one in 242) are in the exact match on city comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"city\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 215.55968114744962, \"log2_bayes_factor\": 7.751943547604383, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on city` then comparison is 216 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 4}, {\"comparison_name\": \"city\", \"sql_condition\": \"jaro_winkler_similarity(\\\"city_l\\\", \\\"city_r\\\") >= 0.9\", \"label_for_charts\": \"Jaro-Winkler distance of city >= 0.9\", \"m_probability\": 0.02417396754770653, \"u_probability\": 0.00030828203643208597, \"m_probability_description\": \"Amongst matching record comparisons, 2.417% of records (i.e. one in 41.37) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.03083% of records (i.e. one in 3,244) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 78.41510270103596, \"log2_bayes_factor\": 6.2930596381757935, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of city >= 0.9` then comparison is 78.42 times more likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 4}, {\"comparison_name\": \"city\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.08330874111021702, \"u_probability\": 0.995551253074257, \"m_probability_description\": \"Amongst matching record comparisons, 8.331% of records (i.e. 
one in 12) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 99.56% of records (i.e. one in 1.004) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.08368101677634383, \"log2_bayes_factor\": -3.578955808442296, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 11.95 times less likely to be a match\", \"probability_two_random_records_match\": 2.182063790806775e-05, \"comparison_sort_order\": 4}]}}, {\"mode\": \"vega-lite\"});\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"layer\": [{\"layer\": [{\"mark\": \"rule\", \"encoding\": {\"color\": {\"value\": \"black\"}, \"size\": {\"value\": 0.5}, \"y\": {\"field\": \"zero\", \"type\": \"quantitative\"}}}, {\"mark\": {\"type\": \"bar\", \"width\": 60}, \"encoding\": {\"color\": {\"condition\": {\"test\": \"(datum.log2_bayes_factor < 0)\", \"value\": \"red\"}, \"value\": \"green\"}, \"opacity\": {\"condition\": {\"test\": \"datum.column_name == 'Prior match weight' || datum.column_name == 'Final score'\", \"value\": 1}, \"value\": 0.5}, \"tooltip\": [{\"field\": \"column_name\", \"title\": \"Comparison column\", \"type\": \"nominal\"}, {\"field\": \"value_l\", \"title\": \"Value (L)\", \"type\": \"nominal\"}, {\"field\": \"value_r\", \"title\": \"Value (R)\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"comparison_vector_value\", \"title\": \"Comparison vector value\", \"type\": \"nominal\"}, {\"field\": \"bayes_factor\", \"format\": \",.4f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \",.4f\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"prob\", \"format\": \".4f\", \"title\": \"Cumulative match probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"grid\": true, \"labelAlign\": \"center\", \"labelAngle\": -20, \"labelExpr\": \"datum.value == 'Prior' || datum.value == 'Final score' ? 
'' : datum.value\", \"labelPadding\": 10, \"tickBand\": \"extent\", \"title\": \"Column\"}, \"field\": \"column_name\", \"sort\": {\"field\": \"bar_sort_order\", \"order\": \"ascending\"}, \"type\": \"nominal\"}, \"y\": {\"axis\": {\"grid\": false, \"orient\": \"left\", \"title\": \"Match Weight\"}, \"field\": \"previous_sum\", \"type\": \"quantitative\"}, \"y2\": {\"field\": \"sum\"}}}, {\"mark\": {\"type\": \"text\", \"fontWeight\": \"bold\"}, \"encoding\": {\"color\": {\"value\": \"white\"}, \"text\": {\"condition\": {\"test\": \"abs(datum.log2_bayes_factor) > 1\", \"field\": \"log2_bayes_factor\", \"format\": \".2f\", \"type\": \"nominal\"}, \"value\": \"\"}, \"x\": {\"axis\": {\"labelAngle\": -20, \"title\": \"Column\"}, \"field\": \"column_name\", \"sort\": {\"field\": \"bar_sort_order\", \"order\": \"ascending\"}, \"type\": \"nominal\"}, \"y\": {\"axis\": {\"orient\": \"left\"}, \"field\": \"center\", \"type\": \"quantitative\"}}}, {\"mark\": {\"type\": \"text\", \"baseline\": \"bottom\", \"dy\": -25, \"fontWeight\": \"bold\"}, \"encoding\": {\"color\": {\"value\": \"black\"}, \"text\": {\"field\": \"column_name\", \"type\": \"nominal\"}, \"x\": {\"axis\": {\"labelAngle\": -20, \"title\": \"Column\"}, \"field\": \"column_name\", \"sort\": {\"field\": \"bar_sort_order\", \"order\": \"ascending\"}, \"type\": \"nominal\"}, \"y\": {\"field\": \"sum_top\", \"type\": \"quantitative\"}}}, {\"mark\": {\"type\": \"text\", \"baseline\": \"bottom\", \"dy\": -13, \"fontSize\": 8}, \"encoding\": {\"color\": {\"value\": \"grey\"}, \"text\": {\"field\": \"value_l\", \"type\": \"nominal\"}, \"x\": {\"axis\": {\"labelAngle\": -20, \"title\": \"Column\"}, \"field\": \"column_name\", \"sort\": {\"field\": \"bar_sort_order\", \"order\": \"ascending\"}, \"type\": \"nominal\"}, \"y\": {\"field\": \"sum_top\", \"type\": \"quantitative\"}}}, {\"mark\": {\"type\": \"text\", \"baseline\": \"bottom\", \"dy\": -5, \"fontSize\": 8}, \"encoding\": {\"color\": {\"value\": \"grey\"}, 
\"text\": {\"field\": \"value_r\", \"type\": \"nominal\"}, \"x\": {\"axis\": {\"labelAngle\": -20, \"title\": \"Column\"}, \"field\": \"column_name\", \"sort\": {\"field\": \"bar_sort_order\", \"order\": \"ascending\"}, \"type\": \"nominal\"}, \"y\": {\"field\": \"sum_top\", \"type\": \"quantitative\"}}}]}, {\"mark\": {\"type\": \"rule\", \"color\": \"black\", \"strokeWidth\": 2, \"x2Offset\": 30, \"xOffset\": -30}, \"encoding\": {\"x\": {\"axis\": {\"labelAngle\": -20, \"title\": \"Column\"}, \"field\": \"column_name\", \"sort\": {\"field\": \"bar_sort_order\", \"order\": \"ascending\"}, \"type\": \"nominal\"}, \"x2\": {\"field\": \"lead\"}, \"y\": {\"axis\": {\"labelExpr\": \"format(1 / (1 + pow(2, -1*datum.value)), '.2r')\", \"orient\": \"right\", \"title\": \"Probability\"}, \"field\": \"sum\", \"scale\": {\"zero\": false}, \"type\": \"quantitative\"}}}], \"data\": {\"name\": \"data-2a99b4d425314fc5f97f7ffbd603dba9\"}, \"height\": 450, \"params\": [{\"name\": \"record_number\", \"bind\": {\"input\": \"range\", \"max\": 5, \"min\": 0, \"step\": 1}, \"value\": 0}], \"resolve\": {\"axis\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Match weights waterfall chart\", \"subtitle\": \"How each comparison contributes to the final match score\"}, \"transform\": [{\"filter\": \"(datum.record_number == record_number)\"}, {\"filter\": \"(datum.bayes_factor !== 1.0)\"}, {\"window\": [{\"op\": \"sum\", \"field\": \"log2_bayes_factor\", \"as\": \"sum\"}, {\"op\": \"lead\", \"field\": \"column_name\", \"as\": \"lead\"}], \"frame\": [null, 0]}, {\"calculate\": \"datum.column_name === \\\"Final score\\\" ? datum.sum - datum.log2_bayes_factor : datum.sum\", \"as\": \"sum\"}, {\"calculate\": \"datum.lead === null ? datum.column_name : datum.lead\", \"as\": \"lead\"}, {\"calculate\": \"datum.column_name === \\\"Final score\\\" || datum.column_name === \\\"Prior match weight\\\" ? 
0 : datum.sum - datum.log2_bayes_factor\", \"as\": \"previous_sum\"}, {\"calculate\": \"datum.sum > datum.previous_sum ? datum.column_name : \\\"\\\"\", \"as\": \"top_label\"}, {\"calculate\": \"datum.sum < datum.previous_sum ? datum.column_name : \\\"\\\"\", \"as\": \"bottom_label\"}, {\"calculate\": \"datum.sum > datum.previous_sum ? datum.sum : datum.previous_sum\", \"as\": \"sum_top\"}, {\"calculate\": \"datum.sum < datum.previous_sum ? datum.sum : datum.previous_sum\", \"as\": \"sum_bottom\"}, {\"calculate\": \"(datum.sum + datum.previous_sum) / 2\", \"as\": \"center\"}, {\"calculate\": \"(datum.log2_bayes_factor > 0 ? \\\"+\\\" : \\\"\\\") + datum.log2_bayes_factor\", \"as\": \"text_log2_bayes_factor\"}, {\"calculate\": \"datum.sum < datum.previous_sum ? 4 : -4\", \"as\": \"dy\"}, {\"calculate\": \"datum.sum < datum.previous_sum ? \\\"top\\\" : \\\"bottom\\\"\", \"as\": \"baseline\"}, {\"calculate\": \"1. / (1 + pow(2, -1.*datum.sum))\", \"as\": \"prob\"}, {\"calculate\": \"0*datum.sum\", \"as\": \"zero\"}], \"width\": {\"step\": 75}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-2a99b4d425314fc5f97f7ffbd603dba9\": [{\"column_name\": \"Prior\", \"label_for_charts\": \"Starting match weight (prior)\", \"sql_condition\": null, \"log2_bayes_factor\": -19.10240998404316, \"bayes_factor\": 1.7766488754200009e-06, \"comparison_vector_value\": null, \"m_probability\": null, \"u_probability\": null, \"bayes_factor_description\": null, \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": null, \"bar_sort_order\": 0, \"record_number\": 0}, {\"sql_condition\": \"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Exact match on company_name_no_legal\", \"m_probability\": 1.0, \"u_probability\": 1.1463345458785815e-06, \"bayes_factor\": 872345.6896551723, \"log2_bayes_factor\": 19.734540428156702, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"If 
comparison level is `exact match on company_name_no_legal` then comparison is 872,346 times more likely to be a match\", \"column_name\": \"company_name_no_legal\", \"value_l\": \"georgia power\", \"value_r\": \"georgia power\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 1, \"record_number\": 0}, {\"sql_condition\": \"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Term freq adjustment on company_name_no_legal with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 0.019854514334617032, \"log2_bayes_factor\": -5.654389118542616, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"Term frequency adjustment on company_name_no_legal makes comparison  50.37 times less likely to be a match\", \"column_name\": \"tf_company_name_no_legal\", \"value_l\": \"georgia power\", \"value_r\": \"georgia power\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 2, \"record_number\": 0}, {\"sql_condition\": \"\\\"street_address_l\\\" IS NULL OR \\\"street_address_r\\\" IS NULL\", \"label_for_charts\": \"street_address is NULL\", \"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, \"comparison_vector_value\": -1, \"bayes_factor_description\": \"If comparison level is `street_address is null` then comparison is 1 times more likely to be a match\", \"column_name\": \"street_address\", \"value_l\": \"241 ralph mcgill boulevard, ['241' 'ralph' 'mcgill' 'boulevard']\", \"value_r\": \"None, nan\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 3, \"record_number\": 0}, {\"sql_condition\": \"\\\"state_l\\\" IS NULL OR \\\"state_r\\\" IS NULL\", \"label_for_charts\": \"state is NULL\", \"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, \"comparison_vector_value\": -1, \"bayes_factor_description\": \"If comparison level is `state is null` then comparison is 1 times more likely to be a match\", \"column_name\": \"state\", \"value_l\": \"ga\", 
\"value_r\": \"None\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 4, \"record_number\": 0}, {\"sql_condition\": \"\\\"state_l\\\" IS NULL OR \\\"state_r\\\" IS NULL\", \"label_for_charts\": \"state is NULL\", \"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, \"comparison_vector_value\": -1, \"bayes_factor_description\": \"If comparison level is `state is null` then comparison is 1 times more likely to be a match\", \"column_name\": \"tf_state\", \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 5, \"record_number\": 0}, {\"sql_condition\": \"\\\"city_l\\\" IS NULL OR \\\"city_r\\\" IS NULL\", \"label_for_charts\": \"city is NULL\", \"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, \"comparison_vector_value\": -1, \"bayes_factor_description\": \"If comparison level is `city is null` then comparison is 1 times more likely to be a match\", \"column_name\": \"city\", \"value_l\": \"atlanta\", \"value_r\": \"None\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 6, \"record_number\": 0}, {\"sql_condition\": \"\\\"city_l\\\" IS NULL OR \\\"city_r\\\" IS NULL\", \"label_for_charts\": \"city is NULL\", \"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, \"comparison_vector_value\": -1, \"bayes_factor_description\": \"If comparison level is `city is null` then comparison is 1 times more likely to be a match\", \"column_name\": \"tf_city\", \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 7, \"record_number\": 0}, {\"column_name\": \"Final score\", \"label_for_charts\": \"Final score\", \"sql_condition\": null, \"log2_bayes_factor\": -5.022258674429072, \"bayes_factor\": 0.030771558522274426, \"comparison_vector_value\": null, \"m_probability\": null, \"u_probability\": null, \"bayes_factor_description\": null, \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": null, \"bar_sort_order\": 8, \"record_number\": 0}, {\"column_name\": \"Prior\", 
\"label_for_charts\": \"Starting match weight (prior)\", \"sql_condition\": null, \"log2_bayes_factor\": -19.10240998404316, \"bayes_factor\": 1.7766488754200009e-06, \"comparison_vector_value\": null, \"m_probability\": null, \"u_probability\": null, \"bayes_factor_description\": null, \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": null, \"bar_sort_order\": 0, \"record_number\": 1}, {\"sql_condition\": \"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Exact match on company_name_no_legal\", \"m_probability\": 1.0, \"u_probability\": 1.1463345458785815e-06, \"bayes_factor\": 872345.6896551723, \"log2_bayes_factor\": 19.734540428156702, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on company_name_no_legal` then comparison is 872,346 times more likely to be a match\", \"column_name\": \"company_name_no_legal\", \"value_l\": \"duke energy\", \"value_r\": \"duke energy\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 1, \"record_number\": 1}, {\"sql_condition\": \"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Term freq adjustment on company_name_no_legal with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 0.009163622000592475, \"log2_bayes_factor\": -6.7698663359625515, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"Term frequency adjustment on company_name_no_legal makes comparison  109.13 times less likely to be a match\", \"column_name\": \"tf_company_name_no_legal\", \"value_l\": \"duke energy\", \"value_r\": \"duke energy\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 2, \"record_number\": 1}, {\"sql_condition\": \"\\\"street_address_l\\\" IS NULL OR \\\"street_address_r\\\" IS NULL\", \"label_for_charts\": \"street_address is NULL\", \"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, 
\"comparison_vector_value\": -1, \"bayes_factor_description\": \"If comparison level is `street_address is null` then comparison is 1 times more likely to be a match\", \"column_name\": \"street_address\", \"value_l\": \"526 south church street, ['526' 'south' 'church' 'street']\", \"value_r\": \"None, nan\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 3, \"record_number\": 1}, {\"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Exact match on state\", \"m_probability\": 0.7040254501446316, \"u_probability\": 0.047388885407471534, \"bayes_factor\": 14.856341188257446, \"log2_bayes_factor\": 3.8930069483478316, \"comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on state` then comparison is 14.86 times more likely to be a match\", \"column_name\": \"state\", \"value_l\": \"nc\", \"value_r\": \"nc\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 4, \"record_number\": 1}, {\"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Term freq adjustment on state with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 1.462263504865088, \"log2_bayes_factor\": 0.5482033132788914, \"comparison_vector_value\": 1, \"bayes_factor_description\": \"Term frequency adjustment on state makes comparison 1.46 times more likely to be a match\", \"column_name\": \"tf_state\", \"value_l\": \"nc\", \"value_r\": \"nc\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 5, \"record_number\": 1}, {\"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Exact match on city\", \"m_probability\": 0.5770511573326839, \"u_probability\": 0.0060865630580326634, \"bayes_factor\": 94.80738995567096, \"log2_bayes_factor\": 6.566927612205768, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on city` then comparison is 94.81 times more likely 
to be a match\", \"column_name\": \"city\", \"value_l\": \"charlotte\", \"value_r\": \"charlotte\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 6, \"record_number\": 1}, {\"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Term freq adjustment on city with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 0.4300084911960021, \"log2_bayes_factor\": -1.2175629465018993, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"Term frequency adjustment on city makes comparison  2.33 times less likely to be a match\", \"column_name\": \"tf_city\", \"value_l\": \"charlotte\", \"value_r\": \"charlotte\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 7, \"record_number\": 1}, {\"column_name\": \"Final score\", \"label_for_charts\": \"Final score\", \"sql_condition\": null, \"log2_bayes_factor\": 3.652839035481583, \"bayes_factor\": 12.57807323388316, \"comparison_vector_value\": null, \"m_probability\": null, \"u_probability\": null, \"bayes_factor_description\": null, \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": null, \"bar_sort_order\": 8, \"record_number\": 1}, {\"column_name\": \"Prior\", \"label_for_charts\": \"Starting match weight (prior)\", \"sql_condition\": null, \"log2_bayes_factor\": -19.10240998404316, \"bayes_factor\": 1.7766488754200009e-06, \"comparison_vector_value\": null, \"m_probability\": null, \"u_probability\": null, \"bayes_factor_description\": null, \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": null, \"bar_sort_order\": 0, \"record_number\": 2}, {\"sql_condition\": \"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Exact match on company_name_no_legal\", \"m_probability\": 1.0, \"u_probability\": 1.1463345458785815e-06, \"bayes_factor\": 872345.6896551723, \"log2_bayes_factor\": 19.734540428156702, \"comparison_vector_value\": 2, 
\"bayes_factor_description\": \"If comparison level is `exact match on company_name_no_legal` then comparison is 872,346 times more likely to be a match\", \"column_name\": \"company_name_no_legal\", \"value_l\": \"pacific gas and electric\", \"value_r\": \"pacific gas and electric\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 1, \"record_number\": 2}, {\"sql_condition\": \"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Term freq adjustment on company_name_no_legal with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 0.019854514334617032, \"log2_bayes_factor\": -5.654389118542616, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"Term frequency adjustment on company_name_no_legal makes comparison  50.37 times less likely to be a match\", \"column_name\": \"tf_company_name_no_legal\", \"value_l\": \"pacific gas and electric\", \"value_r\": \"pacific gas and electric\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 2, \"record_number\": 2}, {\"sql_condition\": \"LEAST(ARRAY_LENGTH(\\\"street_address_list_l\\\"), ARRAY_LENGTH(\\\"street_address_list_r\\\")) <> 0 AND ARRAY_LENGTH(ARRAY_INTERSECT(\\\"street_address_list_l\\\", \\\"street_address_list_r\\\")) = LEAST(ARRAY_LENGTH(\\\"street_address_list_l\\\"), ARRAY_LENGTH(\\\"street_address_list_r\\\"))\", \"label_for_charts\": \"Array subset\", \"m_probability\": 0.16666664895266775, \"u_probability\": 0.6267523364485982, \"bayes_factor\": 0.2659210652441446, \"log2_bayes_factor\": -1.9109300284119297, \"comparison_vector_value\": 0, \"bayes_factor_description\": \"If comparison level is `array subset` then comparison is 3.761 times less likely to be a match\", \"column_name\": \"street_address\", \"value_l\": \"77 beale st, ['77' 'beale' 'st']\", \"value_r\": \"77 beale st  rm 1279amc n12e, ['77' 'beale' 'st' 'rm' '1279amc' 'n12e']\", \"term_frequency_adjustment\": false, 
\"bar_sort_order\": 3, \"record_number\": 2}, {\"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Exact match on state\", \"m_probability\": 0.7040254501446316, \"u_probability\": 0.047388885407471534, \"bayes_factor\": 14.856341188257446, \"log2_bayes_factor\": 3.8930069483478316, \"comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on state` then comparison is 14.86 times more likely to be a match\", \"column_name\": \"state\", \"value_l\": \"ca\", \"value_r\": \"ca\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 4, \"record_number\": 2}, {\"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Term freq adjustment on state with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 0.31774355245472224, \"log2_bayes_factor\": -1.6540652440425645, \"comparison_vector_value\": 1, \"bayes_factor_description\": \"Term frequency adjustment on state makes comparison  3.15 times less likely to be a match\", \"column_name\": \"tf_state\", \"value_l\": \"ca\", \"value_r\": \"ca\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 5, \"record_number\": 2}, {\"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Exact match on city\", \"m_probability\": 0.5770511573326839, \"u_probability\": 0.0060865630580326634, \"bayes_factor\": 94.80738995567096, \"log2_bayes_factor\": 6.566927612205768, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on city` then comparison is 94.81 times more likely to be a match\", \"column_name\": \"city\", \"value_l\": \"san francisco\", \"value_r\": \"san francisco\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 6, \"record_number\": 2}, {\"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Term freq adjustment on city with weight {cl.tf_adjustment_weight}\", 
\"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 0.45509231984910226, \"log2_bayes_factor\": -1.135768855338252, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"Term frequency adjustment on city makes comparison  2.20 times less likely to be a match\", \"column_name\": \"tf_city\", \"value_l\": \"san francisco\", \"value_r\": \"san francisco\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 7, \"record_number\": 2}, {\"column_name\": \"Final score\", \"label_for_charts\": \"Final score\", \"sql_condition\": null, \"log2_bayes_factor\": 0.7369117583317809, \"bayes_factor\": 1.6666044742323236, \"comparison_vector_value\": null, \"m_probability\": null, \"u_probability\": null, \"bayes_factor_description\": null, \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": null, \"bar_sort_order\": 8, \"record_number\": 2}, {\"column_name\": \"Prior\", \"label_for_charts\": \"Starting match weight (prior)\", \"sql_condition\": null, \"log2_bayes_factor\": -19.10240998404316, \"bayes_factor\": 1.7766488754200009e-06, \"comparison_vector_value\": null, \"m_probability\": null, \"u_probability\": null, \"bayes_factor_description\": null, \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": null, \"bar_sort_order\": 0, \"record_number\": 3}, {\"sql_condition\": \"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Exact match on company_name_no_legal\", \"m_probability\": 1.0, \"u_probability\": 1.1463345458785815e-06, \"bayes_factor\": 872345.6896551723, \"log2_bayes_factor\": 19.734540428156702, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on company_name_no_legal` then comparison is 872,346 times more likely to be a match\", \"column_name\": \"company_name_no_legal\", \"value_l\": \"firstenergy\", \"value_r\": \"firstenergy\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 1, \"record_number\": 
3}, {\"sql_condition\": \"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Term freq adjustment on company_name_no_legal with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 0.014890885750962774, \"log2_bayes_factor\": -6.069426617821459, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"Term frequency adjustment on company_name_no_legal makes comparison  67.16 times less likely to be a match\", \"column_name\": \"tf_company_name_no_legal\", \"value_l\": \"firstenergy\", \"value_r\": \"firstenergy\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 2, \"record_number\": 3}, {\"sql_condition\": \"\\\"street_address_l\\\" = \\\"street_address_r\\\"\", \"label_for_charts\": \"Exact match on street_address\", \"m_probability\": 0.7549019576781999, \"u_probability\": 0.13960280373831777, \"bayes_factor\": 5.407498542029615, \"log2_bayes_factor\": 2.434961371207702, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on street_address` then comparison is 5.407 times more likely to be a match\", \"column_name\": \"street_address\", \"value_l\": \"76 south main st, ['76' 'south' 'main' 'st']\", \"value_r\": \"76 south main st, ['76' 'south' 'main' 'st']\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 3, \"record_number\": 3}, {\"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Exact match on state\", \"m_probability\": 0.7040254501446316, \"u_probability\": 0.047388885407471534, \"bayes_factor\": 14.856341188257446, \"log2_bayes_factor\": 3.8930069483478316, \"comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on state` then comparison is 14.86 times more likely to be a match\", \"column_name\": \"state\", \"value_l\": \"oh\", \"value_r\": \"oh\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 4, 
\"record_number\": 3}, {\"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Term freq adjustment on state with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 2.524753729087804, \"log2_bayes_factor\": 1.3361426705767852, \"comparison_vector_value\": 1, \"bayes_factor_description\": \"Term frequency adjustment on state makes comparison 2.52 times more likely to be a match\", \"column_name\": \"tf_state\", \"value_l\": \"oh\", \"value_r\": \"oh\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 5, \"record_number\": 3}, {\"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Exact match on city\", \"m_probability\": 0.5770511573326839, \"u_probability\": 0.0060865630580326634, \"bayes_factor\": 94.80738995567096, \"log2_bayes_factor\": 6.566927612205768, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on city` then comparison is 94.81 times more likely to be a match\", \"column_name\": \"city\", \"value_l\": \"akron\", \"value_r\": \"akron\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 6, \"record_number\": 3}, {\"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Term freq adjustment on city with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 7.691701180548207, \"log2_bayes_factor\": 2.9433027156529468, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"Term frequency adjustment on city makes comparison 7.69 times more likely to be a match\", \"column_name\": \"tf_city\", \"value_l\": \"akron\", \"value_r\": \"akron\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 7, \"record_number\": 3}, {\"column_name\": \"Final score\", \"label_for_charts\": \"Final score\", \"sql_condition\": null, \"log2_bayes_factor\": 11.737045144283117, \"bayes_factor\": 3413.5215495150405, 
\"comparison_vector_value\": null, \"m_probability\": null, \"u_probability\": null, \"bayes_factor_description\": null, \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": null, \"bar_sort_order\": 8, \"record_number\": 3}, {\"column_name\": \"Prior\", \"label_for_charts\": \"Starting match weight (prior)\", \"sql_condition\": null, \"log2_bayes_factor\": -19.10240998404316, \"bayes_factor\": 1.7766488754200009e-06, \"comparison_vector_value\": null, \"m_probability\": null, \"u_probability\": null, \"bayes_factor_description\": null, \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": null, \"bar_sort_order\": 0, \"record_number\": 4}, {\"sql_condition\": \"jaro_winkler_similarity(\\\"company_name_no_legal_l\\\", \\\"company_name_no_legal_r\\\") >= 0.95\", \"label_for_charts\": \"Jaro-Winkler distance of company_name_no_legal >= 0.95\", \"m_probability\": 0.0022252026790704244, \"u_probability\": 3.6564119135782337e-07, \"bayes_factor\": 6085.754919480062, \"log2_bayes_factor\": 12.571220520655558, \"comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of company_name_no_legal >= 0.95` then comparison is 6,086 times more likely to be a match\", \"column_name\": \"company_name_no_legal\", \"value_l\": \"firstenergy\", \"value_r\": \"first energy\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 1, \"record_number\": 4}, {\"sql_condition\": \"jaro_winkler_similarity(\\\"company_name_no_legal_l\\\", \\\"company_name_no_legal_r\\\") >= 0.95\", \"label_for_charts\": \"Jaro-Winkler distance of company_name_no_legal >= 0.95\", \"m_probability\": 0.0022252026790704244, \"u_probability\": 3.6564119135782337e-07, \"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, \"comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of company_name_no_legal >= 0.95` then comparison is 6,086 times more likely to be a match\", 
\"column_name\": \"tf_company_name_no_legal\", \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 2, \"record_number\": 4}, {\"sql_condition\": \"\\\"street_address_l\\\" IS NULL OR \\\"street_address_r\\\" IS NULL\", \"label_for_charts\": \"street_address is NULL\", \"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, \"comparison_vector_value\": -1, \"bayes_factor_description\": \"If comparison level is `street_address is null` then comparison is 1 times more likely to be a match\", \"column_name\": \"street_address\", \"value_l\": \"76 south main st, ['76' 'south' 'main' 'st']\", \"value_r\": \"None, nan\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 3, \"record_number\": 4}, {\"sql_condition\": \"\\\"state_l\\\" IS NULL OR \\\"state_r\\\" IS NULL\", \"label_for_charts\": \"state is NULL\", \"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, \"comparison_vector_value\": -1, \"bayes_factor_description\": \"If comparison level is `state is null` then comparison is 1 times more likely to be a match\", \"column_name\": \"state\", \"value_l\": \"oh\", \"value_r\": \"None\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 4, \"record_number\": 4}, {\"sql_condition\": \"\\\"state_l\\\" IS NULL OR \\\"state_r\\\" IS NULL\", \"label_for_charts\": \"state is NULL\", \"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, \"comparison_vector_value\": -1, \"bayes_factor_description\": \"If comparison level is `state is null` then comparison is 1 times more likely to be a match\", \"column_name\": \"tf_state\", \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 5, \"record_number\": 4}, {\"sql_condition\": \"\\\"city_l\\\" IS NULL OR \\\"city_r\\\" IS NULL\", \"label_for_charts\": \"city is NULL\", \"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, \"comparison_vector_value\": -1, \"bayes_factor_description\": \"If comparison level is `city is null` then comparison is 1 
times more likely to be a match\", \"column_name\": \"city\", \"value_l\": \"akron\", \"value_r\": \"None\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 6, \"record_number\": 4}, {\"sql_condition\": \"\\\"city_l\\\" IS NULL OR \\\"city_r\\\" IS NULL\", \"label_for_charts\": \"city is NULL\", \"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, \"comparison_vector_value\": -1, \"bayes_factor_description\": \"If comparison level is `city is null` then comparison is 1 times more likely to be a match\", \"column_name\": \"tf_city\", \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 7, \"record_number\": 4}, {\"column_name\": \"Final score\", \"label_for_charts\": \"Final score\", \"sql_condition\": null, \"log2_bayes_factor\": -6.5311894633876015, \"bayes_factor\": 0.010812249633775993, \"comparison_vector_value\": null, \"m_probability\": null, \"u_probability\": null, \"bayes_factor_description\": null, \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": null, \"bar_sort_order\": 8, \"record_number\": 4}, {\"column_name\": \"Prior\", \"label_for_charts\": \"Starting match weight (prior)\", \"sql_condition\": null, \"log2_bayes_factor\": -19.10240998404316, \"bayes_factor\": 1.7766488754200009e-06, \"comparison_vector_value\": null, \"m_probability\": null, \"u_probability\": null, \"bayes_factor_description\": null, \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": null, \"bar_sort_order\": 0, \"record_number\": 5}, {\"sql_condition\": \"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Exact match on company_name_no_legal\", \"m_probability\": 1.0, \"u_probability\": 1.1463345458785815e-06, \"bayes_factor\": 872345.6896551723, \"log2_bayes_factor\": 19.734540428156702, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on company_name_no_legal` then comparison is 872,346 times more 
likely to be a match\", \"column_name\": \"company_name_no_legal\", \"value_l\": \"central maine power\", \"value_r\": \"central maine power\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 1, \"record_number\": 5}, {\"sql_condition\": \"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Term freq adjustment on company_name_no_legal with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 0.02978177150192555, \"log2_bayes_factor\": -5.069426617821459, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"Term frequency adjustment on company_name_no_legal makes comparison  33.58 times less likely to be a match\", \"column_name\": \"tf_company_name_no_legal\", \"value_l\": \"central maine power\", \"value_r\": \"central maine power\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 2, \"record_number\": 5}, {\"sql_condition\": \"\\\"street_address_l\\\" IS NULL OR \\\"street_address_r\\\" IS NULL\", \"label_for_charts\": \"street_address is NULL\", \"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, \"comparison_vector_value\": -1, \"bayes_factor_description\": \"If comparison level is `street_address is null` then comparison is 1 times more likely to be a match\", \"column_name\": \"street_address\", \"value_l\": \"edison dr, ['edison' 'dr']\", \"value_r\": \"None, nan\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 3, \"record_number\": 5}, {\"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Exact match on state\", \"m_probability\": 0.7040254501446316, \"u_probability\": 0.047388885407471534, \"bayes_factor\": 14.856341188257446, \"log2_bayes_factor\": 3.8930069483478316, \"comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on state` then comparison is 14.86 times more likely to be a match\", \"column_name\": \"state\", \"value_l\": \"me\", \"value_r\": 
\"me\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 4, \"record_number\": 5}, {\"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Term freq adjustment on state with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 12.796894615433612, \"log2_bayes_factor\": 3.67772185304552, \"comparison_vector_value\": 1, \"bayes_factor_description\": \"Term frequency adjustment on state makes comparison 12.80 times more likely to be a match\", \"column_name\": \"tf_state\", \"value_l\": \"me\", \"value_r\": \"me\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 5, \"record_number\": 5}, {\"sql_condition\": \"\\\"city_l\\\" IS NULL OR \\\"city_r\\\" IS NULL\", \"label_for_charts\": \"city is NULL\", \"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, \"comparison_vector_value\": -1, \"bayes_factor_description\": \"If comparison level is `city is null` then comparison is 1 times more likely to be a match\", \"column_name\": \"city\", \"value_l\": \"augusta\", \"value_r\": \"None\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 6, \"record_number\": 5}, {\"sql_condition\": \"\\\"city_l\\\" IS NULL OR \\\"city_r\\\" IS NULL\", \"label_for_charts\": \"city is NULL\", \"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, \"comparison_vector_value\": -1, \"bayes_factor_description\": \"If comparison level is `city is null` then comparison is 1 times more likely to be a match\", \"column_name\": \"tf_city\", \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 7, \"record_number\": 5}, {\"column_name\": \"Final score\", \"label_for_charts\": \"Final score\", \"sql_condition\": null, \"log2_bayes_factor\": 3.133432627685436, \"bayes_factor\": 8.775203775440346, \"comparison_vector_value\": null, \"m_probability\": null, \"u_probability\": null, \"bayes_factor_description\": null, \"value_l\": \"\", \"value_r\": \"\", 
\"term_frequency_adjustment\": null, \"bar_sort_order\": 8, \"record_number\": 5}]}}, {\"mode\": \"vega-lite\"});\n",
        "</script>"
       ],
       "text/plain": [
-       "alt.HConcatChart(...)"
+       "alt.LayerChart(...)"
       ]
      },
-     "execution_count": 389,
+     "execution_count": 448,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "# company_name doesn't look good here\n",
-    "linker.visualisations.m_u_parameters_chart()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 285,
-   "id": "fedb78e1-ee73-4d1e-8a96-3b27f6561a91",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "settings = linker.misc.save_model_to_json(\n",
-    "    \"model_test.json\", overwrite=True\n",
-    ")"
+    "linker.visualisations.waterfall_chart(recs_to_view, filter_nulls=True)"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "31f9d73d-cfa4-41fa-906f-c8501a29283b",
-   "metadata": {},
-   "source": [
-    "## Make Predictions"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 390,
-   "id": "94e96441-89b6-4516-aa6a-4d1593ce03be",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Blocking time: 0.28 seconds\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "1680da9f410c424d8e5648fc98c88022",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Predict time: 3.06 seconds\n"
-     ]
-    }
-   ],
-   "source": [
-    "df_predictions = linker.inference.predict(threshold_match_probability=0.5)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 391,
-   "id": "722effbc-24e8-45ca-aa64-8cdcd75846e0",
+   "id": "a2ba43b6-a664-462a-823f-e3f08585bb51",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "preds_df = df_predictions.as_pandas_dataframe()"
+    "# Save good predictions"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 392,
-   "id": "21b19a11-15c6-46ab-bd73-7c4752f7e25e",
+   "execution_count": 192,
+   "id": "92172e2f-39ba-49e3-8312-98597256ca4f",
    "metadata": {},
    "outputs": [
     {
@@ -2755,11 +4740,17 @@
        "      <th>company_name_l</th>\n",
        "      <th>company_name_r</th>\n",
        "      <th>gamma_company_name</th>\n",
+       "      <th>tf_company_name_l</th>\n",
+       "      <th>tf_company_name_r</th>\n",
        "      <th>bf_company_name</th>\n",
+       "      <th>bf_tf_adj_company_name</th>\n",
        "      <th>street_address_l</th>\n",
        "      <th>street_address_r</th>\n",
        "      <th>gamma_street_address</th>\n",
+       "      <th>tf_street_address_l</th>\n",
+       "      <th>tf_street_address_r</th>\n",
        "      <th>bf_street_address</th>\n",
+       "      <th>bf_tf_adj_street_address</th>\n",
        "      <th>zip_code_l</th>\n",
        "      <th>zip_code_r</th>\n",
        "      <th>gamma_zip_code</th>\n",
@@ -2767,13 +4758,6 @@
        "      <th>tf_zip_code_r</th>\n",
        "      <th>bf_zip_code</th>\n",
        "      <th>bf_tf_adj_zip_code</th>\n",
-       "      <th>state_l</th>\n",
-       "      <th>state_r</th>\n",
-       "      <th>gamma_state</th>\n",
-       "      <th>tf_state_l</th>\n",
-       "      <th>tf_state_r</th>\n",
-       "      <th>bf_state</th>\n",
-       "      <th>bf_tf_adj_state</th>\n",
        "      <th>city_l</th>\n",
        "      <th>city_r</th>\n",
        "      <th>gamma_city</th>\n",
@@ -2783,226 +4767,221 @@
        "      <th>bf_tf_adj_city</th>\n",
        "      <th>company_name_mphone_l</th>\n",
        "      <th>company_name_mphone_r</th>\n",
-       "      <th>report_year_l</th>\n",
-       "      <th>report_year_r</th>\n",
+       "      <th>street_address_list_l</th>\n",
+       "      <th>street_address_list_r</th>\n",
        "      <th>match_key</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>11211</th>\n",
-       "      <td>0.054332</td>\n",
-       "      <td>0.509414</td>\n",
+       "      <th>199607</th>\n",
+       "      <td>4.265490</td>\n",
+       "      <td>0.950575</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>85762</td>\n",
-       "      <td>68295</td>\n",
-       "      <td>citi trends incorporated</td>\n",
-       "      <td>georgia pacific corporation</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1.462842</td>\n",
-       "      <td>104 coleman boulevard</td>\n",
-       "      <td>None</td>\n",
-       "      <td>-1</td>\n",
+       "      <td>20077</td>\n",
+       "      <td>117512</td>\n",
+       "      <td>prt group incorporated</td>\n",
+       "      <td>pratt and whitney power systems</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000019</td>\n",
+       "      <td>0.000010</td>\n",
+       "      <td>0.991220</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>31408</td>\n",
-       "      <td>31326</td>\n",
+       "      <td>80 lamberton rd</td>\n",
+       "      <td>mail stop 191-13</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.000045</td>\n",
-       "      <td>0.000103</td>\n",
-       "      <td>0.402918</td>\n",
+       "      <td>0.000036</td>\n",
+       "      <td>0.000012</td>\n",
+       "      <td>0.865948</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>ga</td>\n",
-       "      <td>ga</td>\n",
+       "      <td>06095</td>\n",
+       "      <td>06095</td>\n",
        "      <td>1</td>\n",
-       "      <td>0.023374</td>\n",
-       "      <td>0.023374</td>\n",
-       "      <td>22.598054</td>\n",
-       "      <td>1.815434</td>\n",
-       "      <td>savannah</td>\n",
-       "      <td>savannah</td>\n",
+       "      <td>0.000191</td>\n",
+       "      <td>0.000191</td>\n",
+       "      <td>1148.002189</td>\n",
+       "      <td>3.403266</td>\n",
+       "      <td>windsor</td>\n",
+       "      <td>windsor</td>\n",
        "      <td>2</td>\n",
-       "      <td>0.000454</td>\n",
-       "      <td>0.000454</td>\n",
-       "      <td>215.559681</td>\n",
-       "      <td>9.129471</td>\n",
-       "      <td>ST TRNTS INKRPRTT</td>\n",
-       "      <td>JRJ PSFK KRPRXN</td>\n",
-       "      <td>2021</td>\n",
-       "      <td>2008</td>\n",
+       "      <td>0.000279</td>\n",
+       "      <td>0.000279</td>\n",
+       "      <td>126.999683</td>\n",
+       "      <td>24.882561</td>\n",
+       "      <td>PRT KRP</td>\n",
+       "      <td>PRT ANT HTN PWR SSTMS</td>\n",
+       "      <td>[80, lamberton, rd]</td>\n",
+       "      <td>[mail, stop, 191-13]</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>11666</th>\n",
-       "      <td>0.098035</td>\n",
-       "      <td>0.516982</td>\n",
+       "      <th>12041</th>\n",
+       "      <td>4.277468</td>\n",
+       "      <td>0.950964</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>94615</td>\n",
-       "      <td>75114</td>\n",
-       "      <td>chicopee bancorp, incorporated</td>\n",
-       "      <td>chicopee city of</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.845800</td>\n",
-       "      <td>70 center street</td>\n",
-       "      <td>725 front street</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.844089</td>\n",
-       "      <td>01013</td>\n",
-       "      <td>01021</td>\n",
+       "      <td>219453</td>\n",
+       "      <td>113555</td>\n",
+       "      <td>cogentrix energy incorporated</td>\n",
+       "      <td>green country energy limited liability company</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.000036</td>\n",
-       "      <td>0.000061</td>\n",
-       "      <td>0.402918</td>\n",
+       "      <td>0.000019</td>\n",
+       "      <td>0.000038</td>\n",
+       "      <td>0.991220</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>ma</td>\n",
-       "      <td>ma</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.042950</td>\n",
-       "      <td>0.042950</td>\n",
-       "      <td>22.598054</td>\n",
-       "      <td>0.987961</td>\n",
-       "      <td>chicopee</td>\n",
-       "      <td>chicopee</td>\n",
+       "      <td>9405 arrowpoint blvd</td>\n",
+       "      <td>9405 arrowpoint blvd</td>\n",
        "      <td>2</td>\n",
-       "      <td>0.000117</td>\n",
-       "      <td>0.000117</td>\n",
-       "      <td>215.559681</td>\n",
-       "      <td>35.431042</td>\n",
-       "      <td>XKP BNKRP INKRPRTT</td>\n",
-       "      <td>XKP ST OF</td>\n",
-       "      <td>2012</td>\n",
-       "      <td>2012</td>\n",
-       "      <td>0</td>\n",
+       "      <td>0.000534</td>\n",
+       "      <td>0.000534</td>\n",
+       "      <td>14580.390627</td>\n",
+       "      <td>0.015600</td>\n",
+       "      <td>28273</td>\n",
+       "      <td>28273</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.001256</td>\n",
+       "      <td>0.001256</td>\n",
+       "      <td>1148.002189</td>\n",
+       "      <td>0.516567</td>\n",
+       "      <td>charlotte</td>\n",
+       "      <td>chalotte</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.014155</td>\n",
+       "      <td>0.000022</td>\n",
+       "      <td>79.923487</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>KJNTRKS ENRJ</td>\n",
+       "      <td>KRN KNTR ENRJ</td>\n",
+       "      <td>[9405, arrowpoint, blvd]</td>\n",
+       "      <td>[9405, arrowpoint, blvd]</td>\n",
+       "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>11665</th>\n",
-       "      <td>0.098035</td>\n",
-       "      <td>0.516982</td>\n",
+       "      <th>12805</th>\n",
+       "      <td>4.277468</td>\n",
+       "      <td>0.950964</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>94614</td>\n",
-       "      <td>75115</td>\n",
-       "      <td>chicopee bancorp, incorporated</td>\n",
-       "      <td>chicopee city of</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.845800</td>\n",
-       "      <td>70 center street</td>\n",
-       "      <td>725 front street</td>\n",
+       "      <td>219453</td>\n",
+       "      <td>115755</td>\n",
+       "      <td>cogentrix energy incorporated</td>\n",
+       "      <td>jackson county power limited liability company</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.844089</td>\n",
-       "      <td>01013</td>\n",
-       "      <td>01021</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.000036</td>\n",
-       "      <td>0.000061</td>\n",
-       "      <td>0.402918</td>\n",
+       "      <td>0.000019</td>\n",
+       "      <td>0.000029</td>\n",
+       "      <td>0.991220</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>ma</td>\n",
-       "      <td>ma</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.042950</td>\n",
-       "      <td>0.042950</td>\n",
-       "      <td>22.598054</td>\n",
-       "      <td>0.987961</td>\n",
-       "      <td>chicopee</td>\n",
-       "      <td>chicopee</td>\n",
+       "      <td>9405 arrowpoint blvd</td>\n",
+       "      <td>9405 arrowpoint blvd</td>\n",
        "      <td>2</td>\n",
-       "      <td>0.000117</td>\n",
-       "      <td>0.000117</td>\n",
-       "      <td>215.559681</td>\n",
-       "      <td>35.431042</td>\n",
-       "      <td>XKP BNKRP INKRPRTT</td>\n",
-       "      <td>XKP ST OF</td>\n",
-       "      <td>2011</td>\n",
-       "      <td>2011</td>\n",
-       "      <td>0</td>\n",
+       "      <td>0.000534</td>\n",
+       "      <td>0.000534</td>\n",
+       "      <td>14580.390627</td>\n",
+       "      <td>0.015600</td>\n",
+       "      <td>28273</td>\n",
+       "      <td>28273</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.001256</td>\n",
+       "      <td>0.001256</td>\n",
+       "      <td>1148.002189</td>\n",
+       "      <td>0.516567</td>\n",
+       "      <td>charlotte</td>\n",
+       "      <td>chaarlotte</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.014155</td>\n",
+       "      <td>0.000011</td>\n",
+       "      <td>79.923487</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>KJNTRKS ENRJ</td>\n",
+       "      <td>JKSN KNT PWR</td>\n",
+       "      <td>[9405, arrowpoint, blvd]</td>\n",
+       "      <td>[9405, arrowpoint, blvd]</td>\n",
+       "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>11668</th>\n",
-       "      <td>0.098035</td>\n",
-       "      <td>0.516982</td>\n",
+       "      <th>8137</th>\n",
+       "      <td>4.278093</td>\n",
+       "      <td>0.950984</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>94618</td>\n",
-       "      <td>75118</td>\n",
-       "      <td>chicopee bancorp, incorporated</td>\n",
-       "      <td>chicopee city of</td>\n",
+       "      <td>64813</td>\n",
+       "      <td>3879</td>\n",
+       "      <td>rand logistics incorporated</td>\n",
+       "      <td>norridgewock river road solar limited liabilit...</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.845800</td>\n",
-       "      <td>70 center street</td>\n",
-       "      <td>725 front street</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.844089</td>\n",
-       "      <td>01013</td>\n",
-       "      <td>01021</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.000036</td>\n",
-       "      <td>0.000061</td>\n",
-       "      <td>0.402918</td>\n",
+       "      <td>0.000029</td>\n",
+       "      <td>0.000019</td>\n",
+       "      <td>0.991220</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>ma</td>\n",
-       "      <td>ma</td>\n",
+       "      <td>333 washington street</td>\n",
+       "      <td>333 washington street</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.001056</td>\n",
+       "      <td>0.001056</td>\n",
+       "      <td>14580.390627</td>\n",
+       "      <td>0.007888</td>\n",
+       "      <td>07302</td>\n",
+       "      <td>07302</td>\n",
        "      <td>1</td>\n",
-       "      <td>0.042950</td>\n",
-       "      <td>0.042950</td>\n",
-       "      <td>22.598054</td>\n",
-       "      <td>0.987961</td>\n",
-       "      <td>chicopee</td>\n",
-       "      <td>chicopee</td>\n",
+       "      <td>0.002332</td>\n",
+       "      <td>0.002332</td>\n",
+       "      <td>1148.002189</td>\n",
+       "      <td>0.278152</td>\n",
+       "      <td>jersey city</td>\n",
+       "      <td>jersey city</td>\n",
        "      <td>2</td>\n",
-       "      <td>0.000117</td>\n",
-       "      <td>0.000117</td>\n",
-       "      <td>215.559681</td>\n",
-       "      <td>35.431042</td>\n",
-       "      <td>XKP BNKRP INKRPRTT</td>\n",
-       "      <td>XKP ST OF</td>\n",
-       "      <td>2008</td>\n",
-       "      <td>2008</td>\n",
-       "      <td>0</td>\n",
+       "      <td>0.002998</td>\n",
+       "      <td>0.002998</td>\n",
+       "      <td>126.999683</td>\n",
+       "      <td>2.312506</td>\n",
+       "      <td>RNT LJSTKS</td>\n",
+       "      <td>NRJWK RFR RT SLR</td>\n",
+       "      <td>[333, washington, street]</td>\n",
+       "      <td>[333, washington, street]</td>\n",
+       "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>11669</th>\n",
-       "      <td>0.098035</td>\n",
-       "      <td>0.516982</td>\n",
+       "      <th>8136</th>\n",
+       "      <td>4.278093</td>\n",
+       "      <td>0.950984</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>94620</td>\n",
-       "      <td>75116</td>\n",
-       "      <td>chicopee bancorp, incorporated</td>\n",
-       "      <td>chicopee city of</td>\n",
+       "      <td>64813</td>\n",
+       "      <td>5193</td>\n",
+       "      <td>rand logistics incorporated</td>\n",
+       "      <td>anderson solar farm limited liability company</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.845800</td>\n",
-       "      <td>70 center street</td>\n",
-       "      <td>p o box 405</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.844089</td>\n",
-       "      <td>01013</td>\n",
-       "      <td>01021</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.000036</td>\n",
-       "      <td>0.000061</td>\n",
-       "      <td>0.402918</td>\n",
+       "      <td>0.000029</td>\n",
+       "      <td>0.000029</td>\n",
+       "      <td>0.991220</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>ma</td>\n",
-       "      <td>ma</td>\n",
+       "      <td>333 washington street</td>\n",
+       "      <td>333 washington street</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.001056</td>\n",
+       "      <td>0.001056</td>\n",
+       "      <td>14580.390627</td>\n",
+       "      <td>0.007888</td>\n",
+       "      <td>07302</td>\n",
+       "      <td>07302</td>\n",
        "      <td>1</td>\n",
-       "      <td>0.042950</td>\n",
-       "      <td>0.042950</td>\n",
-       "      <td>22.598054</td>\n",
-       "      <td>0.987961</td>\n",
-       "      <td>chicopee</td>\n",
-       "      <td>chicopee</td>\n",
+       "      <td>0.002332</td>\n",
+       "      <td>0.002332</td>\n",
+       "      <td>1148.002189</td>\n",
+       "      <td>0.278152</td>\n",
+       "      <td>jersey city</td>\n",
+       "      <td>jersey city</td>\n",
        "      <td>2</td>\n",
-       "      <td>0.000117</td>\n",
-       "      <td>0.000117</td>\n",
-       "      <td>215.559681</td>\n",
-       "      <td>35.431042</td>\n",
-       "      <td>XKP BNKRP INKRPRTT</td>\n",
-       "      <td>XKP ST OF</td>\n",
-       "      <td>2010</td>\n",
-       "      <td>2010</td>\n",
-       "      <td>0</td>\n",
+       "      <td>0.002998</td>\n",
+       "      <td>0.002998</td>\n",
+       "      <td>126.999683</td>\n",
+       "      <td>2.312506</td>\n",
+       "      <td>RNT LJSTKS</td>\n",
+       "      <td>ANTRSN SLR FRM</td>\n",
+       "      <td>[333, washington, street]</td>\n",
+       "      <td>[333, washington, street]</td>\n",
+       "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>...</th>\n",
@@ -3045,258 +5024,252 @@
        "      <td>...</td>\n",
        "      <td>...</td>\n",
        "      <td>...</td>\n",
-       "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>10043</th>\n",
-       "      <td>45.026591</td>\n",
+       "      <th>199278</th>\n",
+       "      <td>27.514584</td>\n",
        "      <td>1.000000</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>177698</td>\n",
-       "      <td>67483</td>\n",
-       "      <td>green mountain power corporation</td>\n",
-       "      <td>green mountain power corporation</td>\n",
-       "      <td>3</td>\n",
-       "      <td>9751.372250</td>\n",
-       "      <td>163 acorn lane</td>\n",
-       "      <td>163 acorn lane</td>\n",
-       "      <td>3</td>\n",
-       "      <td>24859.333063</td>\n",
-       "      <td>05446</td>\n",
-       "      <td>05446</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.000143</td>\n",
-       "      <td>0.000143</td>\n",
-       "      <td>1447.988342</td>\n",
-       "      <td>2.894003</td>\n",
-       "      <td>vt</td>\n",
-       "      <td>vt</td>\n",
+       "      <td>27759</td>\n",
+       "      <td>142183</td>\n",
+       "      <td>diamond brands incorporated</td>\n",
+       "      <td>diamond brands incorporated</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000029</td>\n",
+       "      <td>0.000029</td>\n",
+       "      <td>7612.680596</td>\n",
+       "      <td>0.037986</td>\n",
+       "      <td>1800 cloquet avenue</td>\n",
+       "      <td>1800 cloquet avenue</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000036</td>\n",
+       "      <td>0.000036</td>\n",
+       "      <td>14580.390627</td>\n",
+       "      <td>0.233998</td>\n",
+       "      <td>55720</td>\n",
+       "      <td>55720</td>\n",
        "      <td>1</td>\n",
-       "      <td>0.002680</td>\n",
-       "      <td>0.002680</td>\n",
-       "      <td>22.598054</td>\n",
-       "      <td>15.835981</td>\n",
-       "      <td>colchester</td>\n",
-       "      <td>colchester</td>\n",
+       "      <td>0.000078</td>\n",
+       "      <td>0.000078</td>\n",
+       "      <td>1148.002189</td>\n",
+       "      <td>8.265075</td>\n",
+       "      <td>cloquet</td>\n",
+       "      <td>cloquet</td>\n",
        "      <td>2</td>\n",
-       "      <td>0.000198</td>\n",
-       "      <td>0.000198</td>\n",
-       "      <td>215.559681</td>\n",
-       "      <td>20.959208</td>\n",
-       "      <td>KRN MNTN PWR KRPRXN</td>\n",
-       "      <td>KRN MNTN PWR KRPRXN</td>\n",
-       "      <td>2001</td>\n",
-       "      <td>2001</td>\n",
+       "      <td>0.000078</td>\n",
+       "      <td>0.000078</td>\n",
+       "      <td>126.999683</td>\n",
+       "      <td>88.866289</td>\n",
+       "      <td>TMNT BRNTS</td>\n",
+       "      <td>TMNT BRNTS</td>\n",
+       "      <td>[1800, cloquet, avenue]</td>\n",
+       "      <td>[1800, cloquet, avenue]</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>10051</th>\n",
-       "      <td>45.026591</td>\n",
+       "      <th>485070</th>\n",
+       "      <td>27.655362</td>\n",
        "      <td>1.000000</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>177702</td>\n",
-       "      <td>67479</td>\n",
-       "      <td>green mountain power corporation</td>\n",
-       "      <td>green mountain power corporation</td>\n",
-       "      <td>3</td>\n",
-       "      <td>9751.372250</td>\n",
-       "      <td>163 acorn lane</td>\n",
-       "      <td>163 acorn lane</td>\n",
-       "      <td>3</td>\n",
-       "      <td>24859.333063</td>\n",
-       "      <td>05446</td>\n",
-       "      <td>05446</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.000143</td>\n",
-       "      <td>0.000143</td>\n",
-       "      <td>1447.988342</td>\n",
-       "      <td>2.894003</td>\n",
-       "      <td>vt</td>\n",
-       "      <td>vt</td>\n",
+       "      <td>50420</td>\n",
+       "      <td>95697</td>\n",
+       "      <td>gulf power company</td>\n",
+       "      <td>gulf power company</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000038</td>\n",
+       "      <td>0.000038</td>\n",
+       "      <td>7612.680596</td>\n",
+       "      <td>0.028490</td>\n",
+       "      <td>one energy place</td>\n",
+       "      <td>one energy place</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>14580.390627</td>\n",
+       "      <td>0.350997</td>\n",
+       "      <td>32520</td>\n",
+       "      <td>32520</td>\n",
        "      <td>1</td>\n",
-       "      <td>0.002680</td>\n",
-       "      <td>0.002680</td>\n",
-       "      <td>22.598054</td>\n",
-       "      <td>15.835981</td>\n",
-       "      <td>colchester</td>\n",
-       "      <td>colchester</td>\n",
+       "      <td>0.000056</td>\n",
+       "      <td>0.000056</td>\n",
+       "      <td>1148.002189</td>\n",
+       "      <td>11.571104</td>\n",
+       "      <td>pensacola</td>\n",
+       "      <td>pensacola</td>\n",
        "      <td>2</td>\n",
-       "      <td>0.000198</td>\n",
-       "      <td>0.000198</td>\n",
-       "      <td>215.559681</td>\n",
-       "      <td>20.959208</td>\n",
-       "      <td>KRN MNTN PWR KRPRXN</td>\n",
-       "      <td>KRN MNTN PWR KRPRXN</td>\n",
-       "      <td>2005</td>\n",
-       "      <td>2005</td>\n",
+       "      <td>0.000111</td>\n",
+       "      <td>0.000111</td>\n",
+       "      <td>126.999683</td>\n",
+       "      <td>62.206402</td>\n",
+       "      <td>KLF PWR</td>\n",
+       "      <td>KLF PWR</td>\n",
+       "      <td>[one, energy, place]</td>\n",
+       "      <td>[one, energy, place]</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>10050</th>\n",
-       "      <td>45.026591</td>\n",
+       "      <th>331565</th>\n",
+       "      <td>27.977290</td>\n",
        "      <td>1.000000</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>177701</td>\n",
-       "      <td>67480</td>\n",
-       "      <td>green mountain power corporation</td>\n",
-       "      <td>green mountain power corporation</td>\n",
-       "      <td>3</td>\n",
-       "      <td>9751.372250</td>\n",
-       "      <td>163 acorn lane</td>\n",
-       "      <td>163 acorn lane</td>\n",
-       "      <td>3</td>\n",
-       "      <td>24859.333063</td>\n",
-       "      <td>05446</td>\n",
-       "      <td>05446</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.000143</td>\n",
-       "      <td>0.000143</td>\n",
-       "      <td>1447.988342</td>\n",
-       "      <td>2.894003</td>\n",
-       "      <td>vt</td>\n",
-       "      <td>vt</td>\n",
+       "      <td>170775</td>\n",
+       "      <td>78563</td>\n",
+       "      <td>berry petroleum company</td>\n",
+       "      <td>berry petroleum company</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000096</td>\n",
+       "      <td>0.000096</td>\n",
+       "      <td>7612.680596</td>\n",
+       "      <td>0.011396</td>\n",
+       "      <td>28700 hovey hills rd</td>\n",
+       "      <td>28700 hovey hills rd</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>14580.390627</td>\n",
+       "      <td>0.350997</td>\n",
+       "      <td>93268</td>\n",
+       "      <td>93268</td>\n",
        "      <td>1</td>\n",
-       "      <td>0.002680</td>\n",
-       "      <td>0.002680</td>\n",
-       "      <td>22.598054</td>\n",
-       "      <td>15.835981</td>\n",
-       "      <td>colchester</td>\n",
-       "      <td>colchester</td>\n",
+       "      <td>0.000045</td>\n",
+       "      <td>0.000045</td>\n",
+       "      <td>1148.002189</td>\n",
+       "      <td>14.463881</td>\n",
+       "      <td>taft</td>\n",
+       "      <td>taft</td>\n",
        "      <td>2</td>\n",
-       "      <td>0.000198</td>\n",
-       "      <td>0.000198</td>\n",
-       "      <td>215.559681</td>\n",
-       "      <td>20.959208</td>\n",
-       "      <td>KRN MNTN PWR KRPRXN</td>\n",
-       "      <td>KRN MNTN PWR KRPRXN</td>\n",
-       "      <td>2004</td>\n",
-       "      <td>2004</td>\n",
+       "      <td>0.000045</td>\n",
+       "      <td>0.000045</td>\n",
+       "      <td>126.999683</td>\n",
+       "      <td>155.516006</td>\n",
+       "      <td>BR PTRLM</td>\n",
+       "      <td>BR PTRLM</td>\n",
+       "      <td>[28700, hovey, hills, rd]</td>\n",
+       "      <td>[28700, hovey, hills, rd]</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>10049</th>\n",
-       "      <td>45.026591</td>\n",
+       "      <th>869341</th>\n",
+       "      <td>28.977290</td>\n",
        "      <td>1.000000</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>177699</td>\n",
-       "      <td>67482</td>\n",
-       "      <td>green mountain power corporation</td>\n",
-       "      <td>green mountain power corporation</td>\n",
-       "      <td>3</td>\n",
-       "      <td>9751.372250</td>\n",
-       "      <td>163 acorn lane</td>\n",
-       "      <td>163 acorn lane</td>\n",
-       "      <td>3</td>\n",
-       "      <td>24859.333063</td>\n",
-       "      <td>05446</td>\n",
-       "      <td>05446</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.000143</td>\n",
-       "      <td>0.000143</td>\n",
-       "      <td>1447.988342</td>\n",
-       "      <td>2.894003</td>\n",
-       "      <td>vt</td>\n",
-       "      <td>vt</td>\n",
+       "      <td>39609</td>\n",
+       "      <td>141382</td>\n",
+       "      <td>eme homer city generation limited partnership</td>\n",
+       "      <td>eme homer city generation limited partnership</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000038</td>\n",
+       "      <td>0.000038</td>\n",
+       "      <td>7612.680596</td>\n",
+       "      <td>0.028490</td>\n",
+       "      <td>1750 power plant road</td>\n",
+       "      <td>1750 power plant road</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>14580.390627</td>\n",
+       "      <td>0.350997</td>\n",
+       "      <td>15748</td>\n",
+       "      <td>15748</td>\n",
        "      <td>1</td>\n",
-       "      <td>0.002680</td>\n",
-       "      <td>0.002680</td>\n",
-       "      <td>22.598054</td>\n",
-       "      <td>15.835981</td>\n",
-       "      <td>colchester</td>\n",
-       "      <td>colchester</td>\n",
+       "      <td>0.000045</td>\n",
+       "      <td>0.000045</td>\n",
+       "      <td>1148.002189</td>\n",
+       "      <td>14.463881</td>\n",
+       "      <td>homer city</td>\n",
+       "      <td>homer city</td>\n",
        "      <td>2</td>\n",
-       "      <td>0.000198</td>\n",
-       "      <td>0.000198</td>\n",
-       "      <td>215.559681</td>\n",
-       "      <td>20.959208</td>\n",
-       "      <td>KRN MNTN PWR KRPRXN</td>\n",
-       "      <td>KRN MNTN PWR KRPRXN</td>\n",
-       "      <td>2002</td>\n",
-       "      <td>2002</td>\n",
+       "      <td>0.000056</td>\n",
+       "      <td>0.000056</td>\n",
+       "      <td>126.999683</td>\n",
+       "      <td>124.412805</td>\n",
+       "      <td>EM HMR ST JNRXN</td>\n",
+       "      <td>EM HMR ST JNRXN</td>\n",
+       "      <td>[1750, power, plant, road]</td>\n",
+       "      <td>[1750, power, plant, road]</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>10035</th>\n",
-       "      <td>45.026591</td>\n",
+       "      <th>73212</th>\n",
+       "      <td>29.544331</td>\n",
        "      <td>1.000000</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>177700</td>\n",
-       "      <td>67481</td>\n",
-       "      <td>green mountain power corporation</td>\n",
-       "      <td>green mountain power corporation</td>\n",
-       "      <td>3</td>\n",
-       "      <td>9751.372250</td>\n",
-       "      <td>163 acorn lane</td>\n",
-       "      <td>163 acorn lane</td>\n",
-       "      <td>3</td>\n",
-       "      <td>24859.333063</td>\n",
-       "      <td>05446</td>\n",
-       "      <td>05446</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.000143</td>\n",
-       "      <td>0.000143</td>\n",
-       "      <td>1447.988342</td>\n",
-       "      <td>2.894003</td>\n",
-       "      <td>vt</td>\n",
-       "      <td>vt</td>\n",
+       "      <td>224681</td>\n",
+       "      <td>50859</td>\n",
+       "      <td>selkirk cogen partners limited partnership</td>\n",
+       "      <td>selkirk cogen partners limited partnership</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000058</td>\n",
+       "      <td>0.000058</td>\n",
+       "      <td>7612.680596</td>\n",
+       "      <td>0.018993</td>\n",
+       "      <td>24 power park drive</td>\n",
+       "      <td>24 power park drive</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>14580.390627</td>\n",
+       "      <td>0.350997</td>\n",
+       "      <td>12158</td>\n",
+       "      <td>12158</td>\n",
        "      <td>1</td>\n",
-       "      <td>0.002680</td>\n",
-       "      <td>0.002680</td>\n",
-       "      <td>22.598054</td>\n",
-       "      <td>15.835981</td>\n",
-       "      <td>colchester</td>\n",
-       "      <td>colchester</td>\n",
+       "      <td>0.000034</td>\n",
+       "      <td>0.000034</td>\n",
+       "      <td>1148.002189</td>\n",
+       "      <td>19.285174</td>\n",
+       "      <td>selkirk</td>\n",
+       "      <td>selkirk</td>\n",
        "      <td>2</td>\n",
-       "      <td>0.000198</td>\n",
-       "      <td>0.000198</td>\n",
-       "      <td>215.559681</td>\n",
-       "      <td>20.959208</td>\n",
-       "      <td>KRN MNTN PWR KRPRXN</td>\n",
-       "      <td>KRN MNTN PWR KRPRXN</td>\n",
-       "      <td>2003</td>\n",
-       "      <td>2003</td>\n",
+       "      <td>0.000033</td>\n",
+       "      <td>0.000033</td>\n",
+       "      <td>126.999683</td>\n",
+       "      <td>207.354675</td>\n",
+       "      <td>SLKRK KJN PRTNRS</td>\n",
+       "      <td>SLKRK KJN PRTNRS</td>\n",
+       "      <td>[24, power, park, drive]</td>\n",
+       "      <td>[24, power, park, drive]</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
-       "<p>12713 rows × 40 columns</p>\n",
+       "<p>3014 rows × 39 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
-       "       match_weight  match_probability         source_dataset_l         source_dataset_r  record_id_l  record_id_r                    company_name_l                    company_name_r  gamma_company_name  bf_company_name       street_address_l  street_address_r  gamma_street_address  bf_street_address zip_code_l zip_code_r  gamma_zip_code  tf_zip_code_l  tf_zip_code_r  bf_zip_code  bf_tf_adj_zip_code state_l state_r  gamma_state  tf_state_l  tf_state_r   bf_state  bf_tf_adj_state      city_l      city_r  gamma_city  tf_city_l  tf_city_r     bf_city  bf_tf_adj_city company_name_mphone_l company_name_mphone_r  report_year_l  report_year_r match_key\n",
-       "11211      0.054332           0.509414  __splink__input_table_0  __splink__input_table_1        85762        68295          citi trends incorporated       georgia pacific corporation                   1         1.462842  104 coleman boulevard              None                    -1           1.000000      31408      31326               0       0.000045       0.000103     0.402918            1.000000      ga      ga            1    0.023374    0.023374  22.598054         1.815434    savannah    savannah           2   0.000454   0.000454  215.559681        9.129471     ST TRNTS INKRPRTT       JRJ PSFK KRPRXN           2021           2008         0\n",
-       "11666      0.098035           0.516982  __splink__input_table_0  __splink__input_table_1        94615        75114    chicopee bancorp, incorporated                  chicopee city of                   0         0.845800       70 center street  725 front street                     0           0.844089      01013      01021               0       0.000036       0.000061     0.402918            1.000000      ma      ma            1    0.042950    0.042950  22.598054         0.987961    chicopee    chicopee           2   0.000117   0.000117  215.559681       35.431042    XKP BNKRP INKRPRTT             XKP ST OF           2012           2012         0\n",
-       "11665      0.098035           0.516982  __splink__input_table_0  __splink__input_table_1        94614        75115    chicopee bancorp, incorporated                  chicopee city of                   0         0.845800       70 center street  725 front street                     0           0.844089      01013      01021               0       0.000036       0.000061     0.402918            1.000000      ma      ma            1    0.042950    0.042950  22.598054         0.987961    chicopee    chicopee           2   0.000117   0.000117  215.559681       35.431042    XKP BNKRP INKRPRTT             XKP ST OF           2011           2011         0\n",
-       "11668      0.098035           0.516982  __splink__input_table_0  __splink__input_table_1        94618        75118    chicopee bancorp, incorporated                  chicopee city of                   0         0.845800       70 center street  725 front street                     0           0.844089      01013      01021               0       0.000036       0.000061     0.402918            1.000000      ma      ma            1    0.042950    0.042950  22.598054         0.987961    chicopee    chicopee           2   0.000117   0.000117  215.559681       35.431042    XKP BNKRP INKRPRTT             XKP ST OF           2008           2008         0\n",
-       "11669      0.098035           0.516982  __splink__input_table_0  __splink__input_table_1        94620        75116    chicopee bancorp, incorporated                  chicopee city of                   0         0.845800       70 center street       p o box 405                     0           0.844089      01013      01021               0       0.000036       0.000061     0.402918            1.000000      ma      ma            1    0.042950    0.042950  22.598054         0.987961    chicopee    chicopee           2   0.000117   0.000117  215.559681       35.431042    XKP BNKRP INKRPRTT             XKP ST OF           2010           2010         0\n",
-       "...             ...                ...                      ...                      ...          ...          ...                               ...                               ...                 ...              ...                    ...               ...                   ...                ...        ...        ...             ...            ...            ...          ...                 ...     ...     ...          ...         ...         ...        ...              ...         ...         ...         ...        ...        ...         ...             ...                   ...                   ...            ...            ...       ...\n",
-       "10043     45.026591           1.000000  __splink__input_table_0  __splink__input_table_1       177698        67483  green mountain power corporation  green mountain power corporation                   3      9751.372250         163 acorn lane    163 acorn lane                     3       24859.333063      05446      05446               1       0.000143       0.000143  1447.988342            2.894003      vt      vt            1    0.002680    0.002680  22.598054        15.835981  colchester  colchester           2   0.000198   0.000198  215.559681       20.959208   KRN MNTN PWR KRPRXN   KRN MNTN PWR KRPRXN           2001           2001         0\n",
-       "10051     45.026591           1.000000  __splink__input_table_0  __splink__input_table_1       177702        67479  green mountain power corporation  green mountain power corporation                   3      9751.372250         163 acorn lane    163 acorn lane                     3       24859.333063      05446      05446               1       0.000143       0.000143  1447.988342            2.894003      vt      vt            1    0.002680    0.002680  22.598054        15.835981  colchester  colchester           2   0.000198   0.000198  215.559681       20.959208   KRN MNTN PWR KRPRXN   KRN MNTN PWR KRPRXN           2005           2005         0\n",
-       "10050     45.026591           1.000000  __splink__input_table_0  __splink__input_table_1       177701        67480  green mountain power corporation  green mountain power corporation                   3      9751.372250         163 acorn lane    163 acorn lane                     3       24859.333063      05446      05446               1       0.000143       0.000143  1447.988342            2.894003      vt      vt            1    0.002680    0.002680  22.598054        15.835981  colchester  colchester           2   0.000198   0.000198  215.559681       20.959208   KRN MNTN PWR KRPRXN   KRN MNTN PWR KRPRXN           2004           2004         0\n",
-       "10049     45.026591           1.000000  __splink__input_table_0  __splink__input_table_1       177699        67482  green mountain power corporation  green mountain power corporation                   3      9751.372250         163 acorn lane    163 acorn lane                     3       24859.333063      05446      05446               1       0.000143       0.000143  1447.988342            2.894003      vt      vt            1    0.002680    0.002680  22.598054        15.835981  colchester  colchester           2   0.000198   0.000198  215.559681       20.959208   KRN MNTN PWR KRPRXN   KRN MNTN PWR KRPRXN           2002           2002         0\n",
-       "10035     45.026591           1.000000  __splink__input_table_0  __splink__input_table_1       177700        67481  green mountain power corporation  green mountain power corporation                   3      9751.372250         163 acorn lane    163 acorn lane                     3       24859.333063      05446      05446               1       0.000143       0.000143  1447.988342            2.894003      vt      vt            1    0.002680    0.002680  22.598054        15.835981  colchester  colchester           2   0.000198   0.000198  215.559681       20.959208   KRN MNTN PWR KRPRXN   KRN MNTN PWR KRPRXN           2003           2003         0\n",
-       "\n",
-       "[12713 rows x 40 columns]"
+       "        match_weight  match_probability         source_dataset_l         source_dataset_r  record_id_l  record_id_r                                 company_name_l                                     company_name_r  gamma_company_name  tf_company_name_l  tf_company_name_r  bf_company_name  bf_tf_adj_company_name       street_address_l       street_address_r  gamma_street_address  tf_street_address_l  tf_street_address_r  bf_street_address  bf_tf_adj_street_address zip_code_l zip_code_r  gamma_zip_code  tf_zip_code_l  tf_zip_code_r  bf_zip_code  bf_tf_adj_zip_code       city_l       city_r  gamma_city  tf_city_l  tf_city_r     bf_city  bf_tf_adj_city company_name_mphone_l  company_name_mphone_r       street_address_list_l       street_address_list_r match_key\n",
+       "199607      4.265490           0.950575  __splink__input_table_0  __splink__input_table_1        20077       117512                         prt group incorporated                    pratt and whitney power systems                   0           0.000019           0.000010         0.991220                1.000000        80 lamberton rd       mail stop 191-13                     0             0.000036             0.000012           0.865948                  1.000000      06095      06095               1       0.000191       0.000191  1148.002189            3.403266      windsor      windsor           2   0.000279   0.000279  126.999683       24.882561               PRT KRP  PRT ANT HTN PWR SSTMS         [80, lamberton, rd]        [mail, stop, 191-13]         0\n",
+       "12041       4.277468           0.950964  __splink__input_table_0  __splink__input_table_1       219453       113555                  cogentrix energy incorporated     green country energy limited liability company                   0           0.000019           0.000038         0.991220                1.000000   9405 arrowpoint blvd   9405 arrowpoint blvd                     2             0.000534             0.000534       14580.390627                  0.015600      28273      28273               1       0.001256       0.001256  1148.002189            0.516567    charlotte     chalotte           1   0.014155   0.000022   79.923487        1.000000          KJNTRKS ENRJ          KRN KNTR ENRJ    [9405, arrowpoint, blvd]    [9405, arrowpoint, blvd]         1\n",
+       "12805       4.277468           0.950964  __splink__input_table_0  __splink__input_table_1       219453       115755                  cogentrix energy incorporated     jackson county power limited liability company                   0           0.000019           0.000029         0.991220                1.000000   9405 arrowpoint blvd   9405 arrowpoint blvd                     2             0.000534             0.000534       14580.390627                  0.015600      28273      28273               1       0.001256       0.001256  1148.002189            0.516567    charlotte   chaarlotte           1   0.014155   0.000011   79.923487        1.000000          KJNTRKS ENRJ           JKSN KNT PWR    [9405, arrowpoint, blvd]    [9405, arrowpoint, blvd]         1\n",
+       "8137        4.278093           0.950984  __splink__input_table_0  __splink__input_table_1        64813         3879                    rand logistics incorporated  norridgewock river road solar limited liabilit...                   0           0.000029           0.000019         0.991220                1.000000  333 washington street  333 washington street                     2             0.001056             0.001056       14580.390627                  0.007888      07302      07302               1       0.002332       0.002332  1148.002189            0.278152  jersey city  jersey city           2   0.002998   0.002998  126.999683        2.312506            RNT LJSTKS       NRJWK RFR RT SLR   [333, washington, street]   [333, washington, street]         1\n",
+       "8136        4.278093           0.950984  __splink__input_table_0  __splink__input_table_1        64813         5193                    rand logistics incorporated      anderson solar farm limited liability company                   0           0.000029           0.000029         0.991220                1.000000  333 washington street  333 washington street                     2             0.001056             0.001056       14580.390627                  0.007888      07302      07302               1       0.002332       0.002332  1148.002189            0.278152  jersey city  jersey city           2   0.002998   0.002998  126.999683        2.312506            RNT LJSTKS         ANTRSN SLR FRM   [333, washington, street]   [333, washington, street]         1\n",
+       "...              ...                ...                      ...                      ...          ...          ...                                            ...                                                ...                 ...                ...                ...              ...                     ...                    ...                    ...                   ...                  ...                  ...                ...                       ...        ...        ...             ...            ...            ...          ...                 ...          ...          ...         ...        ...        ...         ...             ...                   ...                    ...                         ...                         ...       ...\n",
+       "199278     27.514584           1.000000  __splink__input_table_0  __splink__input_table_1        27759       142183                    diamond brands incorporated                        diamond brands incorporated                   2           0.000029           0.000029      7612.680596                0.037986    1800 cloquet avenue    1800 cloquet avenue                     2             0.000036             0.000036       14580.390627                  0.233998      55720      55720               1       0.000078       0.000078  1148.002189            8.265075      cloquet      cloquet           2   0.000078   0.000078  126.999683       88.866289            TMNT BRNTS             TMNT BRNTS     [1800, cloquet, avenue]     [1800, cloquet, avenue]         0\n",
+       "485070     27.655362           1.000000  __splink__input_table_0  __splink__input_table_1        50420        95697                             gulf power company                                 gulf power company                   2           0.000038           0.000038      7612.680596                0.028490       one energy place       one energy place                     2             0.000024             0.000024       14580.390627                  0.350997      32520      32520               1       0.000056       0.000056  1148.002189           11.571104    pensacola    pensacola           2   0.000111   0.000111  126.999683       62.206402               KLF PWR                KLF PWR        [one, energy, place]        [one, energy, place]         0\n",
+       "331565     27.977290           1.000000  __splink__input_table_0  __splink__input_table_1       170775        78563                        berry petroleum company                            berry petroleum company                   2           0.000096           0.000096      7612.680596                0.011396   28700 hovey hills rd   28700 hovey hills rd                     2             0.000024             0.000024       14580.390627                  0.350997      93268      93268               1       0.000045       0.000045  1148.002189           14.463881         taft         taft           2   0.000045   0.000045  126.999683      155.516006              BR PTRLM               BR PTRLM   [28700, hovey, hills, rd]   [28700, hovey, hills, rd]         0\n",
+       "869341     28.977290           1.000000  __splink__input_table_0  __splink__input_table_1        39609       141382  eme homer city generation limited partnership      eme homer city generation limited partnership                   2           0.000038           0.000038      7612.680596                0.028490  1750 power plant road  1750 power plant road                     2             0.000024             0.000024       14580.390627                  0.350997      15748      15748               1       0.000045       0.000045  1148.002189           14.463881   homer city   homer city           2   0.000056   0.000056  126.999683      124.412805       EM HMR ST JNRXN        EM HMR ST JNRXN  [1750, power, plant, road]  [1750, power, plant, road]         0\n",
+       "73212      29.544331           1.000000  __splink__input_table_0  __splink__input_table_1       224681        50859     selkirk cogen partners limited partnership         selkirk cogen partners limited partnership                   2           0.000058           0.000058      7612.680596                0.018993    24 power park drive    24 power park drive                     2             0.000024             0.000024       14580.390627                  0.350997      12158      12158               1       0.000034       0.000034  1148.002189           19.285174      selkirk      selkirk           2   0.000033   0.000033  126.999683      207.354675      SLKRK KJN PRTNRS       SLKRK KJN PRTNRS    [24, power, park, drive]    [24, power, park, drive]         0\n",
+       "\n",
+       "[3014 rows x 39 columns]"
       ]
      },
-     "execution_count": 392,
+     "execution_count": 192,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "preds_df.sort_values(by=\"match_probability\")"
+    "preds_df[preds_df.match_probability >= .95].sort_values(by=\"match_probability\")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "4f63fb3d-5fac-476d-9271-347412121902",
+   "id": "288ffe20-c69e-4c96-8835-765c06303bf2",
    "metadata": {},
    "outputs": [],
    "source": []
diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py
index 6a33b78..2974628 100644
--- a/src/mozilla_sec_eia/models/sec10k/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/__init__.py
@@ -26,13 +26,14 @@
     mlflow_train_test_io_managers,
 )
 
-from . import basic_10k, ex_21, extract
+from . import basic_10k, ex_21, extract, sec_output_table
 from .utils.cloud import cloud_interface_resource
 
 basic_10k_assets = load_assets_from_modules([basic_10k])
 ex21_assets = load_assets_from_package_module(ex_21)
 ex21_data_assets = load_assets_from_modules([ex_21.data])
 shared_assets = load_assets_from_modules([extract])
+sec_output_assets = load_assets_from_modules([sec_output_table])
 
 basic_10k_production_job = model_jobs.create_production_model_job(
     "basic_10k_extraction",
@@ -53,6 +54,9 @@
     ],
 )
 
+sec_output_table_production_job = model_jobs.create_production_model_job(
+    "sec_output_table_creation", sec_output_table.production_assets
+)
 
 exhibit21_extractor = define_dagstermill_asset(
     name="train_exhibit21_extractor",
@@ -97,13 +101,15 @@
     + ex21_assets
     + shared_assets
     + [exhibit21_extractor, exhibit21_layout_classifier]
-    + ex21_data_assets,
+    + ex21_data_assets
+    + sec_output_assets,
     jobs=[
         basic_10k_production_job,
         basic_10k_validation_job,
         ex21_production_job,
         ex21_training_job,
         ex21_layout_classifier_training_job,
+        sec_output_table_production_job,
     ],
     resources={
         "cloud_interface": cloud_interface_resource,
diff --git a/src/mozilla_sec_eia/models/sec10k/sec_output_table.py b/src/mozilla_sec_eia/models/sec10k/sec_output_table.py
new file mode 100644
index 0000000..6f0f900
--- /dev/null
+++ b/src/mozilla_sec_eia/models/sec10k/sec_output_table.py
@@ -0,0 +1,327 @@
+"""Module for creating an SEC 10K output table with filing companies and subsidiary companies."""
+
+import re
+from importlib import resources
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from dagster import AssetIn, AssetOut, multi_asset
+
+from mozilla_sec_eia.models.sec10k.utils.cloud import (
+    GCSArchive,
+    convert_ex21_id_to_filename,
+)
+from mozilla_sec_eia.models.sec_eia_record_linkage.preprocessing import (
+    company_name_cleaner,
+)
+
+# TODO: should this be a shared asset? Can you use the existing sec_10k_filing_metadata with all year quarters?
+archive = GCSArchive()
+md = archive.get_metadata()
+
+INVALID_NAMES = [
+    "llc",
+    "limited liability company",
+    "limited",
+    "ltd",
+    "iiii",
+    "inc",
+    "incorporated",
+    "partnership",
+    "i",
+    "name",
+    "company",
+    "&",
+    "",
+]
+
+
+def _remove_weird_sec_cols(sec_df) -> pd.DataFrame:
+    weird_cols = ["]fiscal_year_end", "]irs_number", "]state_of_incorporation"]
+    for weird_col in weird_cols:
+        if weird_col not in sec_df:
+            continue
+        normal_col = weird_col[1:]
+        sec_df.loc[:, normal_col] = sec_df[normal_col].where(
+            sec_df[weird_col].isnull(), sec_df[weird_col]
+        )
+        sec_df = sec_df.drop(columns=[weird_col])
+    return sec_df
+
+
+def _add_report_year_to_sec(sec_df: pd.DataFrame) -> pd.DataFrame:
+    """Merge metadata on to get a report year for extracted SEC data.
+
+    Expects filename to be the index of the SEC dataframe.
+    """
+    sec_df = sec_df.merge(
+        md[["date_filed"]], how="left", left_index=True, right_index=True
+    )
+    # Derive the report year from the filing date merged in from the metadata
+    # above (NOTE(review): filing year is used as a proxy for the report year).
+    sec_df.loc[:, "report_year"] = sec_df["date_filed"].astype("datetime64[ns]").dt.year
+    return sec_df
+
+
+def _flatten_sec_companies_across_time(sec_df) -> pd.DataFrame:
+    """Keep only the most recent record for each unique SEC CIK.
+
+    Note that this drops old records for companies that have changed
+    names or addresses across time.
+    TODO: create an asset that tracks name and address changes across
+    time.
+    """
+    sec_df = _add_report_year_to_sec(sec_df)
+    sec_df = (
+        sec_df.sort_values(by="report_year", ascending=False)
+        .groupby("central_index_key")
+        .first()
+    )
+    return sec_df
+
+
+def get_sec_state_code_dict() -> dict[str, str]:
+    """Create a dictionary mapping state codes to their names.
+
+    Table found at https://www.sec.gov/submit-filings/filer-support-resources/edgar-state-country-codes
+    Published by SEC and reports valid state codes
+    for filers of Form D. Used to standardize the state codes
+    in the SEC 10K filings. The expanded names of the state codes
+    are comments in the XML file, so we have to read the XML in as
+    text and parse it.
+    """
+    # TODO: make a check to see if SEC has published a new version of this table
+    xml_filepath = (
+        resources.files("mozilla_sec_eia.package_data") / "formDStateCodes.xsd.xml"
+    )
+    with Path.open(xml_filepath) as file:
+        xml_text = file.read()
+
+    pattern = r'<xs:enumeration value="(.*?)"/>.*?<!--\s*(.*?)\s*-->'
+    state_code_dict = {
+        code.lower(): name.lower()
+        for code, name in re.findall(pattern, xml_text, re.DOTALL)
+    }
+    return state_code_dict
+
+
+def clean_loc_of_incorporation(df: pd.DataFrame) -> pd.DataFrame:
+    """Add a cleaned loc_of_incorporation column built from state_of_incorporation.
+
+    Arguments:
+        df: Ex. 21 or SEC 10K basic info dataframe with a state_of_incorporation
+            column. The new column is written onto this dataframe in place.
+    """
+    df.loc[:, "loc_of_incorporation"] = df["state_of_incorporation"].replace(
+        get_sec_state_code_dict()
+    )
+    df["loc_of_incorporation"] = (
+        df["loc_of_incorporation"]
+        .fillna(pd.NA)
+        .str.strip()
+        .str.lower()
+        .replace("", pd.NA)
+    )
+    return df
+
+
+def clean_company_name(df: pd.DataFrame) -> pd.DataFrame:
+    """Clean company name column in SEC basic 10K or Ex. 21 dataframe.
+
+    Arguments:
+        df: Ex. 21 or SEC 10K basic info dataframe with company_name
+            column.
+    """
+    df["company_name"] = (
+        df["company_name"]
+        .fillna(pd.NA)
+        .str.strip()
+        .str.lower()
+        .replace("", pd.NA)
+    )
+    df.loc[:, "company_name_clean"] = company_name_cleaner.apply_name_cleaning(
+        df[["company_name"]]
+    ).str.strip()
+    df = df[
+        (~df["company_name"].isin(INVALID_NAMES))
+        & (~df["company_name_clean"].isin(INVALID_NAMES))
+    ]
+    df = df.fillna(np.nan)
+    return df
+
+
+def add_parent_company_cik(ex21_df: pd.DataFrame) -> pd.DataFrame:
+    """Add the CIK of the parent company to Ex. 21 subsidiaries."""
+    return ex21_df.merge(
+        md["cik"], how="left", left_on="filename", right_index=True
+    ).rename(columns={"cik": "parent_company_cik"})
+
+
+def match_ex21_subsidiaries_to_filer_company(
+    basic10k_df: pd.DataFrame, ex21_df: pd.DataFrame
+) -> pd.DataFrame:
+    """Match Ex. 21 subsidiaries to filer companies.
+
+    We want to assign CIKs to Ex. 21 subsidiaries if they in turn
+    file a 10k. To do this, we merge the Ex. 21 subsidiaries to 10k
+    filers on company name. If there are multiple matches with the same
+    company name we take the company with the most overlap in location of
+    incorporation and nearest report years. Then we merge the CIK back onto
+    the Ex. 21 df.
+
+    Returns:
+        A dataframe of the Ex. 21 subsidiaries with a column for the
+        subsidiaries CIK (null if the subsidiary doesn't file).
+    """
+    basic10k_df = basic10k_df.drop_duplicates(
+        subset=[
+            "central_index_key",
+            "company_name",
+            "loc_of_incorporation",
+            "report_year",
+        ]
+    )
+    merged_df = basic10k_df.merge(
+        ex21_df, how="inner", on="company_name", suffixes=("_sec", "_ex21")
+    )
+    # split up the location of incorporation on whitespace, creating a column
+    # with lists of word tokens
+    merged_df.loc[:, "loc_tokens_sec"] = (
+        merged_df["loc_of_incorporation_sec"].fillna("").str.lower().str.split()
+    )
+    merged_df.loc[:, "loc_tokens_ex21"] = (
+        merged_df["loc_of_incorporation_ex21"].fillna("").str.lower().str.split()
+    )
+    # get the number of words overlapping between location of incorporation tokens
+    merged_df["loc_overlap"] = merged_df.apply(
+        lambda row: len(set(row["loc_tokens_sec"]) & set(row["loc_tokens_ex21"])),
+        axis=1,
+    )
+    # get the difference in report years
+    merged_df["report_year_diff"] = merged_df.apply(
+        lambda row: abs(int(row["report_year_sec"]) - int(row["report_year_ex21"])),
+        axis=1,
+    )
+    merged_df = merged_df.sort_values(
+        by=[
+            "company_name",
+            "loc_of_incorporation_ex21",
+            "loc_overlap",
+            "report_year_diff",
+        ],
+        ascending=[True, True, False, True],
+    )
+    # Select the row with the highest loc overlap and nearest report years
+    # for each company name and location pair
+    closest_match_df = merged_df.groupby(
+        ["company_name", "loc_of_incorporation_ex21"], as_index=False
+    ).first()
+    ex21_with_cik_df = ex21_df.merge(
+        closest_match_df[
+            ["company_name", "central_index_key", "loc_of_incorporation_ex21"]
+        ].rename(columns={"loc_of_incorporation_ex21": "loc_of_incorporation"}),
+        how="left",
+        on=["company_name", "loc_of_incorporation"],
+    ).rename(columns={"central_index_key": "subsidiary_cik"})
+    # if a subsidiary doesn't have a CIK and has a null location
+    # but its company name was assigned a CIK (with a different location)
+    # then assign that CIK to the subsidiary
+    ex21_with_cik_df = ex21_with_cik_df.merge(
+        closest_match_df[["company_name", "central_index_key"]],
+        how="left",
+        on="company_name",
+    ).rename(columns={"central_index_key": "company_name_merge_cik"})
+    ex21_with_cik_df["subsidiary_cik"] = ex21_with_cik_df["subsidiary_cik"].where(
+        ~(ex21_with_cik_df.subsidiary_cik.isnull())
+        | ~(ex21_with_cik_df.loc_of_incorporation.isnull()),
+        ex21_with_cik_df["company_name_merge_cik"],
+    )
+    ex21_with_cik_df = ex21_with_cik_df.rename(
+        columns={"subsidiary_cik": "central_index_key"}
+    )
+    return ex21_with_cik_df
+
+
+@multi_asset(
+    ins={
+        "ex21_df": AssetIn("ex21_company_ownership_info"),
+    },
+    outs={
+        "clean_ex21_subsidiary_table": AssetOut(
+            io_manager_key="pandas_parquet_io_manager",
+        )
+    },
+)
+def clean_ex21_table(ex21_df: pd.DataFrame) -> pd.DataFrame:
+    """Clean Ex. 21 table of subsidiaries before combining with basic 10k table."""
+    ex21_df = convert_ex21_id_to_filename(ex21_df)
+    ex21_df = clean_loc_of_incorporation(ex21_df)
+    ex21_df = clean_company_name(ex21_df)
+    ex21_df = add_parent_company_cik(ex21_df)
+    # flatten out the Ex. 21 table
+    ex21_df = ex21_df.drop_duplicates(
+        subset=["parent_company_cik", "company_name", "loc_of_incorporation"]
+    )
+    return ex21_df
+
+
+@multi_asset(
+    ins={
+        "basic_10k_df": AssetIn("basic_10k_company_info"),
+        "clean_ex21_df": AssetIn("clean_ex21_subsidiary_table"),
+        # specify an io_manager_key?
+    },
+    outs={
+        "out_sec_10k__parents_and_subsidiaries": AssetOut(
+            io_manager_key="pandas_parquet_io_manager",
+            # specify a dagster_type?
+        ),
+    },
+)
+def sec_output_table(
+    basic_10k_df: pd.DataFrame, clean_ex21_df: pd.DataFrame
+) -> pd.DataFrame:
+    """Asset for creating an SEC 10K output table.
+
+    Flatten the table across time to only keep the most recent record
+    for each CIK. Add in Ex. 21 subsidiaries and link them to already present
+    filing companies. Create an sec_company_id for subsidiaries that aren't linked
+    to a CIK.
+    """
+    basic_10k_df = basic_10k_df.reset_index().pivot_table(
+        values="value", index="filename", columns="key", aggfunc="first"
+    )
+    basic_10k_df.columns.name = None
+    basic_10k_df = _remove_weird_sec_cols(basic_10k_df)
+
+    # add a location of incorporation to better match it to Ex. 21 subsidiaries
+    basic_10k_df = clean_loc_of_incorporation(basic_10k_df)
+    basic_10k_df = clean_company_name(basic_10k_df)
+    ex21_df_with_cik = match_ex21_subsidiaries_to_filer_company(
+        basic10k_df=basic_10k_df, ex21_df=clean_ex21_df
+    )
+    basic_10k_df = basic_10k_df.merge(
+        ex21_df_with_cik[["central_index_key", "parent_company_cik", "own_per"]],
+        how="left",
+        on="central_index_key",
+    )
+    basic_10k_df.loc[:, "files_10k"] = True
+    basic_10k_df.loc[:, "sec_company_id"] = basic_10k_df["central_index_key"]
+    ex21_non_filing_subs_df = ex21_df_with_cik[
+        ex21_df_with_cik["central_index_key"].isnull()
+    ].copy()
+    ex21_non_filing_subs_df.loc[:, "files_10k"] = False
+    # create a sec_company_id for the subsidiaries that don't have a CIK
+    ex21_non_filing_subs_df.loc[:, "sec_company_id"] = (
+        ex21_non_filing_subs_df["company_name"]
+        + ex21_non_filing_subs_df["loc_of_incorporation"].fillna("")
+    )
+    out_df = pd.concat([basic_10k_df, ex21_non_filing_subs_df])
+    # this drops records for earlier company names and addresses
+    # that have since changed, so we lose some information
+    out_df = _flatten_sec_companies_across_time(out_df)
+    return out_df
+
+
+production_assets = [sec_output_table]
diff --git a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py
index 1a5ec96..f2e284a 100644
--- a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py
+++ b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py
@@ -29,6 +29,20 @@ def _compute_md5(file_path: Path) -> str:
     return base64.b64encode(hash_md5.digest()).decode()
 
 
+def convert_ex21_id_to_filename(df: pd.DataFrame, id_col_name: str = "id"):
+    """Convert the ID column to GCS archive filenames.
+
+    The extracted Ex. 21 tables have an ID that doesn't match
+    the filenames in the GCS archive. Create a new column "filename"
+    that converts this ID column into the GCS archive filename
+    for that filing.
+    """
+    df.loc[:, "filename"] = (
+        "edgar/data/" + df[id_col_name].str.replace("-", "/", n=1) + ".txt"
+    )
+    return df
+
+
 class Exhibit21(BaseModel):
     """This is a class to wrap Exhibit 21's, which are included in many SEC 10ks."""
 
diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/create_eia_input.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/create_eia_input.py
new file mode 100644
index 0000000..d0266b9
--- /dev/null
+++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/create_eia_input.py
@@ -0,0 +1,76 @@
+"""Create an EIA input utilities table that's ready for record linkage with the SEC 10K companies."""
+
+import pandas as pd
+
+
+# TODO: make Dagster inputs instead of reading from AWS?
+def get_eia861_utilities_table():
+    """Get the utilities contained in EIA Form 861.
+
+    TODO: In PUDL we should eventually implement an actual thorough
+    harvesting of utilities from all EIA Form 861 tables, but this is
+    good enough for now.
+    """
+    raw_eia861_df = pd.read_parquet(
+        "s3://pudl.catalyst.coop/stable/core_eia861__assn_utility.parquet"
+    )
+    harvested_df = pd.concat(
+        [
+            pd.read_parquet(
+                "s3://pudl.catalyst.coop/stable/core_eia861__yearly_utility_data_misc.parquet"
+            )[["report_date", "utility_id_eia", "utility_name_eia"]],
+            pd.read_parquet(
+                "s3://pudl.catalyst.coop/stable/core_eia861__yearly_operational_data_misc.parquet"
+            )[["report_date", "utility_id_eia", "utility_name_eia"]],
+            pd.read_parquet(
+                "s3://pudl.catalyst.coop/stable/core_eia861__yearly_demand_side_management_misc.parquet"
+            )[["report_date", "utility_id_eia", "utility_name_eia"]],
+            pd.read_parquet(
+                "s3://pudl.catalyst.coop/stable/core_eia861__yearly_energy_efficiency.parquet"
+            )[["report_date", "utility_id_eia", "utility_name_eia"]],
+        ]
+    )
+    eia861_df = raw_eia861_df.merge(
+        harvested_df, on=["report_date", "utility_id_eia"], how="left"
+    ).drop_duplicates(subset=["report_date", "utility_id_eia"])
+    mergers_df = pd.read_parquet(
+        "s3://pudl.catalyst.coop/stable/core_eia861__yearly_mergers.parquet"
+    )
+    mergers_df = mergers_df[mergers_df["new_parent"].notna()]
+    eia861_df = eia861_df.merge(
+        mergers_df[
+            ["report_date", "new_parent", "merge_address", "merge_city", "merge_state"]
+        ],
+        how="left",
+        left_on=["report_date", "utility_name_eia"],
+        right_on=["report_date", "new_parent"],
+    )
+    eia861_df = eia861_df.rename(
+        columns={"merge_address": "street_address", "merge_city": "city"}
+    )
+    eia861_df = (
+        eia861_df.groupby(["report_date", "utility_id_eia"]).first().reset_index()
+    )
+
+    eia861_df["state"] = eia861_df["state"].where(
+        eia861_df["merge_state"].isnull(), eia861_df["merge_state"]
+    )
+    eia861_df = eia861_df.drop(columns=["new_parent", "merge_state"])
+    return eia861_df
+
+
+# TODO: add Dagster asset inputs for PUDL inputs instead of reading from AWS?
+def get_eia_utilities_table():
+    """Create a table of EIA Form 860 and 861 utilities."""
+    raw_eia_df = pd.read_parquet(
+        "s3://pudl.catalyst.coop/stable/out_eia__yearly_utilities.parquet"
+    )
+    eia861_df = get_eia861_utilities_table()
+    eia_df = pd.concat([raw_eia_df, eia861_df])
+    eia_df = eia_df.drop_duplicates(
+        subset=["utility_id_eia", "report_date"], keep="first"
+    )
+    eia_df["report_date"] = eia_df["report_date"].astype("datetime64[ns]")
+    # drop 861 utilities that have no harvested name (utility_name_eia is null)
+    eia_df = eia_df.dropna(subset="utility_name_eia")
+    return eia_df
diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py
index ebb7843..12c4704 100644
--- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py
+++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py
@@ -1,5 +1,9 @@
 """Preprocessing for EIA and SEC input data before record linkage."""
 
+import re
+from importlib import resources
+from pathlib import Path
+
 import jellyfish
 import numpy as np
 import pandas as pd
@@ -60,82 +64,15 @@
     "",
 ]
 
-state_code_dict = {
-    # https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States#States.
-    "AK": "Alaska",
-    "AL": "Alabama",
-    "AR": "Arkansas",
-    "AZ": "Arizona",
-    "CA": "California",
-    "CO": "Colorado",
-    "CT": "Connecticut",
-    "DE": "Delaware",
-    "FL": "Florida",
-    "GA": "Georgia",
-    "HI": "Hawaii",
-    "IA": "Iowa",
-    "ID": "Idaho",
-    "IL": "Illinois",
-    "IN": "Indiana",
-    "KS": "Kansas",
-    "KY": "Kentucky",
-    "LA": "Louisiana",
-    "MA": "Massachusetts",
-    "MD": "Maryland",
-    "ME": "Maine",
-    "MI": "Michigan",
-    "MN": "Minnesota",
-    "MO": "Missouri",
-    "MS": "Mississippi",
-    "MT": "Montana",
-    "NC": "North Carolina",
-    "ND": "North Dakota",
-    "NE": "Nebraska",
-    "NH": "New Hampshire",
-    "NJ": "New Jersey",
-    "NM": "New Mexico",
-    "NV": "Nevada",
-    "NY": "New York",
-    "OH": "Ohio",
-    "OK": "Oklahoma",
-    "OR": "Oregon",
-    "PA": "Pennsylvania",
-    "RI": "Rhode Island",
-    "SC": "South Carolina",
-    "SD": "South Dakota",
-    "TN": "Tennessee",
-    "TX": "Texas",
-    "UT": "Utah",
-    "VA": "Virginia",
-    "VT": "Vermont",
-    "WA": "Washington",
-    "WI": "Wisconsin",
-    "WV": "West Virginia",
-    "WY": "Wyoming",
-    # https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States#Federal_district.
-    "DC": "District of Columbia",
-    # https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States#Inhabited_territories.
-    "AS": "American Samoa",
-    "GU": "Guam GU",
-    "MP": "Northern Mariana Islands",
-    "PR": "Puerto Rico PR",
-    "VI": "U.S. Virgin Islands",
-}
-state_code_to_name = {k.lower(): v.lower() for k, v in state_code_dict.items()}
 
 company_name_cleaner = name_cleaner.CompanyNameCleaner(
     cleaning_rules_list=[
         "remove_word_the_from_the_end",
         "remove_word_the_from_the_beginning",
-        "replace_amperstand_between_space_by_AND",
+        "replace_ampersand_by_AND",
         "replace_hyphen_by_space",
-        "replace_hyphen_between_spaces_by_single_space",
         "replace_underscore_by_space",
-        "replace_underscore_between_spaces_by_single_space",
-        # "remove_all_punctuation",
-        # "remove_numbers",
-        # "remove_math_symbols",
-        "remove_words_in_parentheses",
+        "remove_text_punctuation",
         "remove_parentheses",
         "remove_brackets",
         "remove_curly_brackets",
@@ -143,7 +80,38 @@
     ]
 )
 
+legal_term_remover = name_cleaner.CompanyNameCleaner(
+    cleaning_rules_list=[], handle_legal_terms=2
+)
+
+
+# TODO: remove
+def get_sec_state_code_dict():
+    """Create a dictionary mapping state codes to their names.
+
+    Table found at https://www.sec.gov/submit-filings/filer-support-resources/edgar-state-country-codes
+    Published by the SEC, it lists the valid state and country codes
+    for filers of Form D. Used to standardize the state codes
+    in the SEC 10K filings. The expanded names of the codes are only
+    present as comments in the XML file, so we read the XML in as
+    text and parse it. Codes and names are lowercased in the result.
+    """
+    # TODO: make a check to see if SEC has published a new version of this table
+    xml_filepath = (
+        resources.files("mozilla_sec_eia.package_data") / "formDStateCodes.xsd.xml"
+    )
+    with Path.open(xml_filepath) as file:
+        xml_text = file.read()
+
+    pattern = r'<xs:enumeration value="(.*?)"/>.*?<!--\s*(.*?)\s*-->'
+    state_code_dict = {
+        code.lower(): name.lower()
+        for code, name in re.findall(pattern, xml_text, re.DOTALL)
+    }
+    return state_code_dict
 
+
+# TODO: moved to output table module, take out
 def _add_report_year_to_sec(sec_df):
     """Merge metadata on to get a report year for extracted SEC data.
 
@@ -151,9 +119,13 @@ def _add_report_year_to_sec(sec_df):
     """
     archive = GCSArchive()
     md = archive.get_metadata()
-    return sec_df.merge(
+    sec_df = sec_df.merge(
         md[["date_filed"]], how="left", left_index=True, right_index=True
     )
+    sec_df.loc[:, "report_year"] = (
+        sec_df["report_date"].astype("datetime64[ns]").dt.year
+    )
+    return sec_df
 
 
 # TODO: this is in PUDL, pull out into helper function
@@ -163,6 +135,7 @@ def _get_metaphone(row, col_name):
     return jellyfish.metaphone(row[col_name])
 
 
+# TODO: deduplicate this with what's already been done
 def _clean_company_name(df):
     df.loc[:, "company_name_clean"] = company_name_cleaner.apply_name_cleaning(
         df[["company_name"]]
@@ -171,9 +144,13 @@ def _clean_company_name(df):
     df = df.rename(columns={"company_name": "company_name_raw"}).rename(
         columns={"company_name_clean": "company_name"}
     )
+    df.loc[:, "company_name_no_legal"] = legal_term_remover.apply_name_cleaning(
+        df[["company_name"]]
+    )
     return df
 
 
+# TODO: deduplicate this with what's already been done
 def clean_sec_df(df):
     """Shared cleaning for SEC 10K and Ex. 21 dataframes.
 
@@ -185,29 +162,32 @@ def clean_sec_df(df):
         df[["company_name", "loc_of_incorporation"]]
         .fillna(pd.NA)
         .apply(lambda x: x.str.strip().str.lower())
+        .replace("", pd.NA)
     )
-    df.loc[:, "company_name"] = df["company_name"].replace("", pd.NA)
-    df.loc[:, "loc_of_incorporation"] = df["loc_of_incorporation"].replace("", pd.NA)
     df = _clean_company_name(df)
+    df.loc[:, "company_name_mphone"] = df.apply(
+        _get_metaphone, axis=1, args=("company_name_no_legal",)
+    )
     df = df[
         (~df["company_name"].isin(INVALID_NAMES))
-        & ~(df["company_name_raw"].isin(INVALID_NAMES))
+        & (~df["company_name_raw"].isin(INVALID_NAMES))
     ]
     df = df.fillna(np.nan)
-    df = df.drop_duplicates(
-        subset=["company_name", "loc_of_incorporation", "report_year"]
-    )
+
     return df
 
 
+# TODO: moved to output table module, take out
 def _remove_weird_sec_cols(sec_df):
-    for weird_col in ["]fiscal_year_end", "]irs_number", "]state_of_incorporation"]:
+    weird_cols = ["]fiscal_year_end", "]irs_number", "]state_of_incorporation"]
+    for weird_col in weird_cols:
         if weird_col not in sec_df:
             continue
         normal_col = weird_col[1:]
         sec_df.loc[:, normal_col] = sec_df[normal_col].where(
             sec_df[weird_col].isnull(), sec_df[weird_col]
         )
+        sec_df = sec_df.drop(columns=[weird_col])
     return sec_df
 
 
@@ -215,26 +195,35 @@ def _remove_weird_sec_cols(sec_df):
 # later unite them into one cleaning function
 def prepare_sec10k_basic_info_df(sec_df):
     """Preprocess SEC 10k basic information dataframe for record linkage."""
-    sec_df = _add_report_year_to_sec(sec_df)
+    # sec_df = _add_report_year_to_sec(sec_df)
     sec_df = sec_df.rename(columns=SEC_COL_MAP).reset_index()
-    sec_df.loc[:, "report_year"] = (
-        sec_df["report_date"].astype("datetime64[ns]").dt.year
-    )
-    sec_df.loc[:, "loc_of_incorporation"] = sec_df["state_of_incorporation"].replace(
-        state_code_to_name
-    )
+    # state_code_to_name = get_sec_state_code_dict()
+    # sec_df.loc[:, "loc_of_incorporation"] = sec_df["state_of_incorporation"].replace(
+    #     state_code_to_name
+    # )
     # TODO: maybe shouldn't expand the state names and comparison should
     # just be an exact match or nothing?
     # sec_df.loc[:, "state"] = sec_df["state"].replace(state_code_to_name)
     # TODO: needs a record_id_sec column?
     # sec_df = sec_df.rename(columns={"record_id_sec": "record_id"})
-    sec_df = _remove_weird_sec_cols(sec_df)
+    # sec_df = _remove_weird_sec_cols(sec_df)
     sec_df = clean_sec_df(sec_df)
     sec_df[STR_COLS] = sec_df[STR_COLS].apply(lambda x: x.str.strip().str.lower())
-    sec_df.loc[:, "company_name_mphone"] = sec_df.apply(
-        _get_metaphone, axis=1, args=("company_name",)
+    # TODO: cluster/mark these duplicates so they can be assigned
+    # IDs post matching
+    sec_df = sec_df.drop_duplicates(
+        subset=[
+            "central_index_key",
+            "report_year",
+            "company_name",
+            "standard_industrial_classification",
+            "city",
+            "state",
+            "street_address",
+            "zip_code",
+        ]
     )
-    sec_df = sec_df.reset_index(names="record_id")
+    sec_df.loc[:, "sec_company_id"] = sec_df["central_index_key"]
     return sec_df
 
 
@@ -242,14 +231,20 @@ def prepare_ex21_df(ex21_df):
     """Preprocess Ex. 21 extracted dataframe for record linkage."""
     ex21_df = ex21_df.rename(columns=EX21_COL_MAP)
     # TODO: move this to general preprocessing function?
+    state_code_to_name = get_sec_state_code_dict()
     ex21_df.loc[:, "loc_of_incorporation"] = ex21_df["loc_of_incorporation"].replace(
         state_code_to_name
     )
+    name_to_state_code = {v: k for k, v in state_code_to_name.items()}
+    # TODO: is the state_of_incorporation column still needed?
+    ex21_df.loc[:, "state_of_incorporation"] = ex21_df["loc_of_incorporation"].replace(
+        name_to_state_code
+    )
     ex21_df = clean_sec_df(ex21_df)
-    ex21_df.loc[:, "company_name_mphone"] = ex21_df.apply(
-        _get_metaphone, axis=1, args=("company_name",)
+    ex21_df = ex21_df.drop_duplicates(
+        subset=["company_name", "loc_of_incorporation", "report_year"]
     )
-    ex21_df = ex21_df.reset_index(names="record_id")
+    # ex21_df = ex21_df.reset_index(drop=True).reset_index(names="record_id")
     return ex21_df
 
 
@@ -263,26 +258,35 @@ def prepare_eia_df(eia_df):
     eia_df[STR_COLS] = eia_df[STR_COLS].apply(lambda x: x.str.strip().str.lower())
     eia_df = _clean_company_name(eia_df)
     eia_df.loc[:, "company_name_mphone"] = eia_df.apply(
-        _get_metaphone, axis=1, args=("company_name",)
+        _get_metaphone, axis=1, args=("company_name_no_legal",)
     )
-    eia_df = eia_df.reset_index(names="record_id")
+    eia_df = eia_df.reset_index(drop=True).reset_index(names="record_id")
     return eia_df
 
 
-"""
-def preprocessing(eia_df, sec_df):
-    # TODO: reorganize to be more similar to ferc to eia match structure
-    eia_df = eia_df.rename(columns=EIA_COL_MAP)
+def add_sec_company_id_to_subsidiaries(ex21_df: pd.DataFrame):
+    """Add sec_company_id onto SEC Ex. 21 subsidiaries.
 
-    # TODO: fill out this prepare for matching function
-    # eia_df = prepare_for_matching(eia_df)
-    # sec_df = prepare_for_matching(sec_df)
-    sec_df.loc[:, "loc_of_incorporation"] = sec_df["state_of_incorporation"].replace(
-        state_code_to_name
+    At this point, the passed in Ex. 21 dataframe should have been
+    matched to SEC 10K filers with record linkage and assigned a CIK
+    where applicable (if the subsidiary files with the SEC). Take the
+    subsidiaries that don't have a CIK and create an sec_company_id
+    for those companies.
+
+    Arguments:
+        ex21_df: A dataframe of subsidiaries from SEC Ex. 21 filings with
+        columns parent_cik, subsidiary_cik, company_name (of the subsidiary),
+        and loc_of_incorporation.
+    """
+    ex21_df = ex21_df.sort_values(by="parent_cik")
+    ex21_df = ex21_df.drop_duplicates(subset=["company_name", "loc_of_incorporation"])
+    ex21_df.loc[:, "sec_company_id"] = (
+        ex21_df["parent_cik"]
+        + "_"
+        + (ex21_df.groupby("parent_cik").cumcount() + 1).astype(str)
     )
-    sec_df.loc[:, "loc_of_incorporation"] = sec_df["loc_of_incorporation"].where(
-        ~sec_df["loc_of_incorporation"].isnull(), sec_df["city"]
+    # override sec_company_id with CIK where a subsidiary has an assigned CIK
+    ex21_df.loc[:, "sec_company_id"] = ex21_df["sec_company_id"].where(
+        ex21_df["subsidiary_cik"].isnull(), ex21_df["subsidiary_cik"]
     )
-    sec_df = sec_df.rename(columns={"record_id_sec": "record_id"})
-    eia_df = eia_df.rename(columns={"record_id_eia": "record_id"})
-"""
+    return ex21_df
diff --git a/src/mozilla_sec_eia/package_data/formDStateCodes.xsd.xml b/src/mozilla_sec_eia/package_data/formDStateCodes.xsd.xml
new file mode 100644
index 0000000..d5b3c3d
--- /dev/null
+++ b/src/mozilla_sec_eia/package_data/formDStateCodes.xsd.xml
@@ -0,0 +1,328 @@
+<?xml version="1.0"?>
+
+<!-- Filename: formDStateCodes.xsd.xml -->
+<!-- Purpose:  Define the set of EDGAR State and Country Codes specific to Form D. -->
+<!-- Version:  X301 -->
+
+<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"
+           elementFormDefault="qualified">
+
+    <xs:simpleType name="STATE_COUNTRY_CODE">
+        <xs:annotation>
+            <xs:documentation>
+                Set of valid State and Country Codes according to EDGAR.
+            </xs:documentation>
+        </xs:annotation>
+		<xs:restriction base="xs:string">			
+			<xs:enumeration value="AL"/>  <!-- ALABAMA -->
+			<xs:enumeration value="AK"/>  <!-- ALASKA -->
+			<xs:enumeration value="AZ"/>  <!-- ARIZONA -->
+			<xs:enumeration value="AR"/>  <!-- ARKANSAS -->
+			<xs:enumeration value="CA"/>  <!-- CALIFORNIA -->
+			<xs:enumeration value="CO"/>  <!-- COLORADO -->
+			<xs:enumeration value="CT"/>  <!-- CONNECTICUT -->
+			<xs:enumeration value="DE"/>  <!-- DELAWARE -->
+			<xs:enumeration value="DC"/>  <!-- DISTRICT OF COLUMBIA -->
+			<xs:enumeration value="FL"/>  <!-- FLORIDA -->
+			<xs:enumeration value="GA"/>  <!-- GEORGIA -->
+			<xs:enumeration value="HI"/>  <!-- HAWAII -->
+			<xs:enumeration value="ID"/>  <!-- IDAHO -->
+			<xs:enumeration value="IL"/>  <!-- ILLINOIS -->
+			<xs:enumeration value="IN"/>  <!-- INDIANA -->
+			<xs:enumeration value="IA"/>  <!-- IOWA -->
+			<xs:enumeration value="KS"/>  <!-- KANSAS -->
+			<xs:enumeration value="KY"/>  <!-- KENTUCKY -->
+			<xs:enumeration value="LA"/>  <!-- LOUISIANA -->
+			<xs:enumeration value="ME"/>  <!-- MAINE -->
+			<xs:enumeration value="MD"/>  <!-- MARYLAND -->
+			<xs:enumeration value="MA"/>  <!-- MASSACHUSETTS -->
+			<xs:enumeration value="MI"/>  <!-- MICHIGAN -->
+			<xs:enumeration value="MN"/>  <!-- MINNESOTA -->
+			<xs:enumeration value="MS"/>  <!-- MISSISSIPPI -->
+			<xs:enumeration value="MO"/>  <!-- MISSOURI -->
+			<xs:enumeration value="MT"/>  <!-- MONTANA -->
+			<xs:enumeration value="NE"/>  <!-- NEBRASKA -->
+			<xs:enumeration value="NV"/>  <!-- NEVADA -->
+			<xs:enumeration value="NH"/>  <!-- NEW HAMPSHIRE -->
+			<xs:enumeration value="NJ"/>  <!-- NEW JERSEY -->
+			<xs:enumeration value="NM"/>  <!-- NEW MEXICO -->
+			<xs:enumeration value="NY"/>  <!-- NEW YORK -->
+			<xs:enumeration value="NC"/>  <!-- NORTH CAROLINA -->
+			<xs:enumeration value="ND"/>  <!-- NORTH DAKOTA -->
+			<xs:enumeration value="OH"/>  <!-- OHIO -->
+			<xs:enumeration value="OK"/>  <!-- OKLAHOMA -->
+			<xs:enumeration value="OR"/>  <!-- OREGON -->
+			<xs:enumeration value="PA"/>  <!-- PENNSYLVANIA -->
+			<xs:enumeration value="RI"/>  <!-- RHODE ISLAND -->
+			<xs:enumeration value="SC"/>  <!-- SOUTH CAROLINA -->
+			<xs:enumeration value="SD"/>  <!-- SOUTH DAKOTA -->
+			<xs:enumeration value="TN"/>  <!-- TENNESSEE -->
+			<xs:enumeration value="TX"/>  <!-- TEXAS -->
+			<xs:enumeration value="X1"/>  <!-- UNITED STATES -->
+			<xs:enumeration value="UT"/>  <!-- UTAH -->
+			<xs:enumeration value="VT"/>  <!-- VERMONT -->
+			<xs:enumeration value="VA"/>  <!-- VIRGINIA -->
+			<xs:enumeration value="WA"/>  <!-- WASHINGTON -->
+			<xs:enumeration value="WV"/>  <!-- WEST VIRGINIA -->
+			<xs:enumeration value="WI"/>  <!-- WISCONSIN -->
+			<xs:enumeration value="WY"/>  <!-- WYOMING -->
+			<xs:enumeration value="A0"/>  <!-- ALBERTA, CANADA -->
+			<xs:enumeration value="A1"/>  <!-- BRITISH COLUMBIA, CANADA -->
+			<xs:enumeration value="A2"/>  <!-- MANITOBA, CANADA -->
+			<xs:enumeration value="A3"/>  <!-- NEW BRUNSWICK, CANADA -->
+			<xs:enumeration value="A4"/>  <!-- NEWFOUNDLAND, CANADA -->
+			<xs:enumeration value="A5"/>  <!-- NOVA SCOTIA, CANADA -->
+			<xs:enumeration value="A6"/>  <!-- ONTARIO, CANADA -->
+			<xs:enumeration value="A7"/>  <!-- PRINCE EDWARD ISLAND, CANADA -->
+			<xs:enumeration value="A8"/>  <!-- QUEBEC, CANADA -->
+			<xs:enumeration value="A9"/>  <!-- SASKATCHEWAN, CANADA -->
+			<xs:enumeration value="B0"/>  <!-- YUKON, CANADA -->
+			<xs:enumeration value="Z4"/>  <!-- CANADA (FEDERAL LEVEL) -->
+			<xs:enumeration value="B2"/>  <!-- AFGHANISTAN -->
+			<xs:enumeration value="Y6"/>  <!-- ALAND ISLANDS -->
+			<xs:enumeration value="B3"/>  <!-- ALBANIA -->
+			<xs:enumeration value="B4"/>  <!-- ALGERIA -->
+			<xs:enumeration value="B5"/>  <!-- AMERICAN SAMOA -->
+			<xs:enumeration value="B6"/>  <!-- ANDORRA -->
+			<xs:enumeration value="B7"/>  <!-- ANGOLA -->
+			<xs:enumeration value="1A"/>  <!-- ANGUILLA -->
+			<xs:enumeration value="B8"/>  <!-- ANTARCTICA -->
+			<xs:enumeration value="B9"/>  <!-- ANTIGUA AND BARBUDA -->
+			<xs:enumeration value="C1"/>  <!-- ARGENTINA -->
+			<xs:enumeration value="1B"/>  <!-- ARMENIA -->
+			<xs:enumeration value="1C"/>  <!-- ARUBA -->
+			<xs:enumeration value="C3"/>  <!-- AUSTRALIA -->
+			<xs:enumeration value="C4"/>  <!-- AUSTRIA -->
+			<xs:enumeration value="1D"/>  <!-- AZERBAIJAN -->
+			<xs:enumeration value="C5"/>  <!-- BAHAMAS -->
+			<xs:enumeration value="C6"/>  <!-- BAHRAIN -->
+			<xs:enumeration value="C7"/>  <!-- BANGLADESH -->
+			<xs:enumeration value="C8"/>  <!-- BARBADOS -->
+			<xs:enumeration value="1F"/>  <!-- BELARUS -->
+			<xs:enumeration value="C9"/>  <!-- BELGIUM -->
+			<xs:enumeration value="D1"/>  <!-- BELIZE -->
+			<xs:enumeration value="G6"/>  <!-- BENIN -->
+			<xs:enumeration value="D0"/>  <!-- BERMUDA -->
+			<xs:enumeration value="D2"/>  <!-- BHUTAN -->
+			<xs:enumeration value="D3"/>  <!-- BOLIVIA -->
+			<xs:enumeration value="1E"/>  <!-- BOSNIA AND HERZEGOVINA -->
+			<xs:enumeration value="B1"/>  <!-- BOTSWANA -->
+			<xs:enumeration value="D4"/>  <!-- BOUVET ISLAND -->
+			<xs:enumeration value="D5"/>  <!-- BRAZIL -->
+			<xs:enumeration value="D6"/>  <!-- BRITISH INDIAN OCEAN TERRITORY -->
+			<xs:enumeration value="D9"/>  <!-- BRUNEI DARUSSALAM -->
+			<xs:enumeration value="E0"/>  <!-- BULGARIA -->
+			<xs:enumeration value="X2"/>  <!-- BURKINA FASO -->
+			<xs:enumeration value="E2"/>  <!-- BURUNDI -->
+			<xs:enumeration value="E3"/>  <!-- CAMBODIA -->
+			<xs:enumeration value="E4"/>  <!-- CAMEROON -->
+			<xs:enumeration value="E8"/>  <!-- CAPE VERDE -->
+			<xs:enumeration value="E9"/>  <!-- CAYMAN ISLANDS -->
+			<xs:enumeration value="F0"/>  <!-- CENTRAL AFRICAN REPUBLIC -->
+			<xs:enumeration value="F2"/>  <!-- CHAD -->
+			<xs:enumeration value="F3"/>  <!-- CHILE -->
+			<xs:enumeration value="F4"/>  <!-- CHINA -->
+			<xs:enumeration value="F6"/>  <!-- CHRISTMAS ISLAND -->
+			<xs:enumeration value="F7"/>  <!-- COCOS (KEELING) ISLANDS -->
+			<xs:enumeration value="F8"/>  <!-- COLOMBIA -->
+			<xs:enumeration value="F9"/>  <!-- COMOROS -->
+			<xs:enumeration value="G0"/>  <!-- CONGO -->
+			<xs:enumeration value="Y3"/>  <!-- CONGO, THE DEMOCRATIC REPUBLIC OF THE -->
+			<xs:enumeration value="G1"/>  <!-- COOK ISLANDS -->
+			<xs:enumeration value="G2"/>  <!-- COSTA RICA -->
+			<xs:enumeration value="L7"/>  <!-- COTE D'IVOIRE -->
+			<xs:enumeration value="1M"/>  <!-- CROATIA -->
+			<xs:enumeration value="G3"/>  <!-- CUBA -->
+			<xs:enumeration value="G4"/>  <!-- CYPRUS -->
+			<xs:enumeration value="2N"/>  <!-- CZECH REPUBLIC -->
+			<xs:enumeration value="G7"/>  <!-- DENMARK -->
+			<xs:enumeration value="1G"/>  <!-- DJIBOUTI -->
+			<xs:enumeration value="G9"/>  <!-- DOMINICA -->
+			<xs:enumeration value="G8"/>  <!-- DOMINICAN REPUBLIC -->
+			<xs:enumeration value="H1"/>  <!-- ECUADOR -->
+			<xs:enumeration value="H2"/>  <!-- EGYPT -->
+			<xs:enumeration value="H3"/>  <!-- EL SALVADOR -->
+			<xs:enumeration value="H4"/>  <!-- EQUATORIAL GUINEA -->
+			<xs:enumeration value="1J"/>  <!-- ERITREA -->
+			<xs:enumeration value="1H"/>  <!-- ESTONIA -->
+			<xs:enumeration value="H5"/>  <!-- ETHIOPIA -->
+			<xs:enumeration value="H7"/>  <!-- FALKLAND ISLANDS (MALVINAS) -->
+			<xs:enumeration value="H6"/>  <!-- FAROE ISLANDS -->
+			<xs:enumeration value="H8"/>  <!-- FIJI -->
+			<xs:enumeration value="H9"/>  <!-- FINLAND -->
+			<xs:enumeration value="I0"/>  <!-- FRANCE -->
+			<xs:enumeration value="I3"/>  <!-- FRENCH GUIANA -->
+			<xs:enumeration value="I4"/>  <!-- FRENCH POLYNESIA -->
+			<xs:enumeration value="2C"/>  <!-- FRENCH SOUTHERN TERRITORIES -->
+			<xs:enumeration value="I5"/>  <!-- GABON -->
+			<xs:enumeration value="I6"/>  <!-- GAMBIA -->
+			<xs:enumeration value="2Q"/>  <!-- GEORGIA -->
+			<xs:enumeration value="2M"/>  <!-- GERMANY -->
+			<xs:enumeration value="J0"/>  <!-- GHANA -->
+			<xs:enumeration value="J1"/>  <!-- GIBRALTAR -->
+			<xs:enumeration value="J3"/>  <!-- GREECE -->
+			<xs:enumeration value="J4"/>  <!-- GREENLAND -->
+			<xs:enumeration value="J5"/>  <!-- GRENADA -->
+			<xs:enumeration value="J6"/>  <!-- GUADELOUPE -->
+			<xs:enumeration value="GU"/>  <!-- GUAM -->
+			<xs:enumeration value="J8"/>  <!-- GUATEMALA -->
+			<xs:enumeration value="Y7"/>  <!-- GUERNSEY -->
+			<xs:enumeration value="J9"/>  <!-- GUINEA -->
+			<xs:enumeration value="S0"/>  <!-- GUINEA-BISSAU -->
+			<xs:enumeration value="K0"/>  <!-- GUYANA -->
+			<xs:enumeration value="K1"/>  <!-- HAITI -->
+			<xs:enumeration value="K4"/>  <!-- HEARD ISLAND AND MCDONALD ISLANDS -->
+			<xs:enumeration value="X4"/>  <!-- HOLY SEE (VATICAN CITY STATE) -->
+			<xs:enumeration value="K2"/>  <!-- HONDURAS -->
+			<xs:enumeration value="K3"/>  <!-- HONG KONG -->
+			<xs:enumeration value="K5"/>  <!-- HUNGARY -->
+			<xs:enumeration value="K6"/>  <!-- ICELAND -->
+			<xs:enumeration value="K7"/>  <!-- INDIA -->
+			<xs:enumeration value="K8"/>  <!-- INDONESIA -->
+			<xs:enumeration value="K9"/>  <!-- IRAN, ISLAMIC REPUBLIC OF -->
+			<xs:enumeration value="L0"/>  <!-- IRAQ -->
+			<xs:enumeration value="L2"/>  <!-- IRELAND -->
+			<xs:enumeration value="Y8"/>  <!-- ISLE OF MAN -->
+			<xs:enumeration value="L3"/>  <!-- ISRAEL -->
+			<xs:enumeration value="L6"/>  <!-- ITALY -->
+			<xs:enumeration value="L8"/>  <!-- JAMAICA -->
+			<xs:enumeration value="M0"/>  <!-- JAPAN -->
+			<xs:enumeration value="Y9"/>  <!-- JERSEY -->
+			<xs:enumeration value="M2"/>  <!-- JORDAN -->
+			<xs:enumeration value="1P"/>  <!-- KAZAKSTAN -->
+			<xs:enumeration value="M3"/>  <!-- KENYA -->
+			<xs:enumeration value="J2"/>  <!-- KIRIBATI -->
+			<xs:enumeration value="M4"/>  <!-- KOREA, DEMOCRATIC PEOPLE'S REPUBLIC OF -->
+			<xs:enumeration value="M5"/>  <!-- KOREA, REPUBLIC OF -->
+			<xs:enumeration value="M6"/>  <!-- KUWAIT -->
+			<xs:enumeration value="1N"/>  <!-- KYRGYZSTAN -->
+			<xs:enumeration value="M7"/>  <!-- LAO PEOPLE'S DEMOCRATIC REPUBLIC -->
+			<xs:enumeration value="1R"/>  <!-- LATVIA -->
+			<xs:enumeration value="M8"/>  <!-- LEBANON -->
+			<xs:enumeration value="M9"/>  <!-- LESOTHO -->
+			<xs:enumeration value="N0"/>  <!-- LIBERIA -->
+			<xs:enumeration value="N1"/>  <!-- LIBYAN ARAB JAMAHIRIYA -->
+			<xs:enumeration value="N2"/>  <!-- LIECHTENSTEIN -->
+			<xs:enumeration value="1Q"/>  <!-- LITHUANIA -->
+			<xs:enumeration value="N4"/>  <!-- LUXEMBOURG -->
+			<xs:enumeration value="N5"/>  <!-- MACAU -->
+			<xs:enumeration value="1U"/>  <!-- MACEDONIA, THE FORMER YUGOSLAV REPUBLIC OF -->
+			<xs:enumeration value="N6"/>  <!-- MADAGASCAR -->
+			<xs:enumeration value="N7"/>  <!-- MALAWI -->
+			<xs:enumeration value="N8"/>  <!-- MALAYSIA -->
+			<xs:enumeration value="N9"/>  <!-- MALDIVES -->
+			<xs:enumeration value="O0"/>  <!-- MALI -->
+			<xs:enumeration value="O1"/>  <!-- MALTA -->
+			<xs:enumeration value="1T"/>  <!-- MARSHALL ISLANDS -->
+			<xs:enumeration value="O2"/>  <!-- MARTINIQUE -->
+			<xs:enumeration value="O3"/>  <!-- MAURITANIA -->
+			<xs:enumeration value="O4"/>  <!-- MAURITIUS -->
+			<xs:enumeration value="2P"/>  <!-- MAYOTTE -->
+			<xs:enumeration value="O5"/>  <!-- MEXICO -->
+			<xs:enumeration value="1K"/>  <!-- MICRONESIA, FEDERATED STATES OF -->
+			<xs:enumeration value="1S"/>  <!-- MOLDOVA, REPUBLIC OF -->
+			<xs:enumeration value="O9"/>  <!-- MONACO -->
+			<xs:enumeration value="P0"/>  <!-- MONGOLIA -->
+			<xs:enumeration value="Z5"/>  <!-- MONTENEGRO -->
+			<xs:enumeration value="P1"/>  <!-- MONTSERRAT -->
+			<xs:enumeration value="P2"/>  <!-- MOROCCO -->
+			<xs:enumeration value="P3"/>  <!-- MOZAMBIQUE -->
+			<xs:enumeration value="E1"/>  <!-- MYANMAR -->
+			<xs:enumeration value="T6"/>  <!-- NAMIBIA -->
+			<xs:enumeration value="P5"/>  <!-- NAURU -->
+			<xs:enumeration value="P6"/>  <!-- NEPAL -->
+			<xs:enumeration value="P7"/>  <!-- NETHERLANDS -->
+			<xs:enumeration value="P8"/>  <!-- NETHERLANDS ANTILLES -->
+			<xs:enumeration value="1W"/>  <!-- NEW CALEDONIA -->
+			<xs:enumeration value="Q2"/>  <!-- NEW ZEALAND -->
+			<xs:enumeration value="Q3"/>  <!-- NICARAGUA -->
+			<xs:enumeration value="Q4"/>  <!-- NIGER -->
+			<xs:enumeration value="Q5"/>  <!-- NIGERIA -->
+			<xs:enumeration value="Q6"/>  <!-- NIUE -->
+			<xs:enumeration value="Q7"/>  <!-- NORFOLK ISLAND -->
+			<xs:enumeration value="1V"/>  <!-- NORTHERN MARIANA ISLANDS -->
+			<xs:enumeration value="Q8"/>  <!-- NORWAY -->
+			<xs:enumeration value="P4"/>  <!-- OMAN -->
+			<xs:enumeration value="R0"/>  <!-- PAKISTAN -->
+			<xs:enumeration value="1Y"/>  <!-- PALAU -->
+			<xs:enumeration value="1X"/>  <!-- PALESTINIAN TERRITORY, OCCUPIED -->
+			<xs:enumeration value="R1"/>  <!-- PANAMA -->
+			<xs:enumeration value="R2"/>  <!-- PAPUA NEW GUINEA -->
+			<xs:enumeration value="R4"/>  <!-- PARAGUAY -->
+			<xs:enumeration value="R5"/>  <!-- PERU -->
+			<xs:enumeration value="R6"/>  <!-- PHILIPPINES -->
+			<xs:enumeration value="R8"/>  <!-- PITCAIRN -->
+			<xs:enumeration value="R9"/>  <!-- POLAND -->
+			<xs:enumeration value="S1"/>  <!-- PORTUGAL -->
+			<xs:enumeration value="PR"/>  <!-- PUERTO RICO -->
+			<xs:enumeration value="S3"/>  <!-- QATAR -->
+			<xs:enumeration value="S4"/>  <!-- REUNION -->
+			<xs:enumeration value="S5"/>  <!-- ROMANIA -->
+			<xs:enumeration value="1Z"/>  <!-- RUSSIAN FEDERATION -->
+			<xs:enumeration value="S6"/>  <!-- RWANDA -->
+			<xs:enumeration value="Z0"/>  <!-- SAINT BARTHELEMY -->
+			<xs:enumeration value="U8"/>  <!-- SAINT HELENA -->
+			<xs:enumeration value="U7"/>  <!-- SAINT KITTS AND NEVIS -->
+			<xs:enumeration value="U9"/>  <!-- SAINT LUCIA -->
+			<xs:enumeration value="Z1"/>  <!-- SAINT MARTIN -->
+			<xs:enumeration value="V0"/>  <!-- SAINT PIERRE AND MIQUELON -->
+			<xs:enumeration value="V1"/>  <!-- SAINT VINCENT AND THE GRENADINES -->
+			<xs:enumeration value="Y0"/>  <!-- SAMOA -->
+			<xs:enumeration value="S8"/>  <!-- SAN MARINO -->
+			<xs:enumeration value="S9"/>  <!-- SAO TOME AND PRINCIPE -->
+			<xs:enumeration value="T0"/>  <!-- SAUDI ARABIA -->
+			<xs:enumeration value="T1"/>  <!-- SENEGAL -->
+			<xs:enumeration value="Z2"/>  <!-- SERBIA -->
+			<xs:enumeration value="T2"/>  <!-- SEYCHELLES -->
+			<xs:enumeration value="T8"/>  <!-- SIERRA LEONE -->
+			<xs:enumeration value="U0"/>  <!-- SINGAPORE -->
+			<xs:enumeration value="2B"/>  <!-- SLOVAKIA -->
+			<xs:enumeration value="2A"/>  <!-- SLOVENIA -->
+			<xs:enumeration value="D7"/>  <!-- SOLOMON ISLANDS -->
+			<xs:enumeration value="U1"/>  <!-- SOMALIA -->
+			<xs:enumeration value="T3"/>  <!-- SOUTH AFRICA -->
+			<xs:enumeration value="1L"/>  <!-- SOUTH GEORGIA AND THE SOUTH SANDWICH ISLANDS -->
+			<xs:enumeration value="U3"/>  <!-- SPAIN -->
+			<xs:enumeration value="F1"/>  <!-- SRI LANKA -->
+			<xs:enumeration value="V2"/>  <!-- SUDAN -->
+			<xs:enumeration value="V3"/>  <!-- SURINAME -->
+			<xs:enumeration value="L9"/>  <!-- SVALBARD AND JAN MAYEN -->
+			<xs:enumeration value="V6"/>  <!-- SWAZILAND -->
+			<xs:enumeration value="V7"/>  <!-- SWEDEN -->
+			<xs:enumeration value="V8"/>  <!-- SWITZERLAND -->
+			<xs:enumeration value="V9"/>  <!-- SYRIAN ARAB REPUBLIC -->
+			<xs:enumeration value="F5"/>  <!-- TAIWAN, PROVINCE OF CHINA -->
+			<xs:enumeration value="2D"/>  <!-- TAJIKISTAN -->
+			<xs:enumeration value="W0"/>  <!-- TANZANIA, UNITED REPUBLIC OF -->
+			<xs:enumeration value="W1"/>  <!-- THAILAND -->
+			<xs:enumeration value="Z3"/>  <!-- TIMOR-LESTE -->
+			<xs:enumeration value="W2"/>  <!-- TOGO -->
+			<xs:enumeration value="W3"/>  <!-- TOKELAU -->
+			<xs:enumeration value="W4"/>  <!-- TONGA -->
+			<xs:enumeration value="W5"/>  <!-- TRINIDAD AND TOBAGO -->
+			<xs:enumeration value="W6"/>  <!-- TUNISIA -->
+			<xs:enumeration value="W8"/>  <!-- TURKEY -->
+			<xs:enumeration value="2E"/>  <!-- TURKMENISTAN -->
+			<xs:enumeration value="W7"/>  <!-- TURKS AND CAICOS ISLANDS -->
+			<xs:enumeration value="2G"/>  <!-- TUVALU -->
+			<xs:enumeration value="W9"/>  <!-- UGANDA -->
+			<xs:enumeration value="2H"/>  <!-- UKRAINE -->
+			<xs:enumeration value="C0"/>  <!-- UNITED ARAB EMIRATES -->
+			<xs:enumeration value="X0"/>  <!-- UNITED KINGDOM -->
+			<xs:enumeration value="2J"/>  <!-- UNITED STATES MINOR OUTLYING ISLANDS -->
+			<xs:enumeration value="X3"/>  <!-- URUGUAY -->
+			<xs:enumeration value="2K"/>  <!-- UZBEKISTAN -->
+			<xs:enumeration value="2L"/>  <!-- VANUATU -->
+			<xs:enumeration value="X5"/>  <!-- VENEZUELA -->
+			<xs:enumeration value="Q1"/>  <!-- VIET NAM -->
+			<xs:enumeration value="D8"/>  <!-- VIRGIN ISLANDS, BRITISH -->
+			<xs:enumeration value="VI"/>  <!-- VIRGIN ISLANDS, U.S. -->
+			<xs:enumeration value="X8"/>  <!-- WALLIS AND FUTUNA -->
+			<xs:enumeration value="U5"/>  <!-- WESTERN SAHARA -->
+			<xs:enumeration value="T7"/>  <!-- YEMEN -->
+			<xs:enumeration value="Y4"/>  <!-- ZAMBIA -->
+			<xs:enumeration value="Y5"/>  <!-- ZIMBABWE -->
+			<xs:enumeration value="XX"/>  <!-- UNKNOWN -->
+		</xs:restriction>
+    </xs:simpleType>
+</xs:schema>

From 88f17f2ed1f8c264d5b35f568d64701d074f800c Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Fri, 29 Nov 2024 15:18:57 -0500
Subject: [PATCH 133/161] fix errors with asset creation

---
 .../models/sec10k/sec_output_table.py         | 81 ++++++++++++-------
 1 file changed, 53 insertions(+), 28 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec10k/sec_output_table.py b/src/mozilla_sec_eia/models/sec10k/sec_output_table.py
index 6f0f900..a00cc99 100644
--- a/src/mozilla_sec_eia/models/sec10k/sec_output_table.py
+++ b/src/mozilla_sec_eia/models/sec10k/sec_output_table.py
@@ -1,5 +1,7 @@
 """Module for creating an SEC 10K output table with filing companies and subsidiary companies."""
 
+import logging
+
 import re
 from importlib import resources
 from pathlib import Path
@@ -16,9 +18,16 @@
     company_name_cleaner,
 )
 
+from .extract import (
+    sec10k_filing_metadata,
+    year_quarter_partitions,
+)
+
+logger = logging.getLogger(f"catalystcoop.{__name__}")
+
 # TODO: should this be a shared asset? Can you use the existing sec_10k_filing_metadata with all year quarters?
-archive = GCSArchive()
-md = archive.get_metadata()
+# archive = GCSArchive()
+# md = archive.get_metadata()
 
 INVALID_NAMES = [
     "llc",
@@ -50,21 +59,20 @@ def _remove_weird_sec_cols(sec_df) -> pd.DataFrame:
     return sec_df
 
 
-def _add_report_year_to_sec(sec_df) -> pd.DataFrame:
+def _add_report_year_to_sec(sec_df: pd.DataFrame, md: pd.DataFrame) -> pd.DataFrame:
     """Merge metadata on to get a report year for extracted SEC data.
 
     Expects filename to be the index of the SEC dataframe.
     """
-    sec_df = sec_df.merge(
-        md[["date_filed"]], how="left", left_index=True, right_index=True
-    )
+    sec_df = sec_df.merge(md[["filename", "date_filed"]], how="left", on=["filename"])
+    sec_df = sec_df.rename(columns={"date_filed": "report_date"})
     sec_df.loc[:, "report_year"] = (
         sec_df["report_date"].astype("datetime64[ns]").dt.year
     )
     return sec_df
 
 
-def _flatten_sec_companies_across_time(sec_df) -> pd.DataFrame:
+def _flatten_sec_companies_across_time(sec_df: pd.DataFrame) -> pd.DataFrame:
     """Keep only the most recent record for each unique SEC CIK.
 
     Note that this drops old records for companies that have changed
@@ -72,7 +80,6 @@ def _flatten_sec_companies_across_time(sec_df) -> pd.DataFrame:
     TODO: create an asset that tracks name and address chnages across
     time.
     """
-    sec_df = _add_report_year_to_sec(sec_df)
     sec_df = (
         sec_df.sort_values(by="report_year", ascending=False)
         .groupby("central_index_key")
@@ -113,14 +120,16 @@ def clean_loc_of_incorporation(df) -> pd.DataFrame:
         df: Ex. 21 or SEC 10K basic info dataframe with loc_of_incorporation
             column.
     """
-    state_code_to_name = get_sec_state_code_dict()
-    df.loc[:, "loc_of_incorporation"] = df["state_of_incorporation"].replace(
-        state_code_to_name
-    )
+    if "state_of_incorporation" in df:
+        state_code_to_name = get_sec_state_code_dict()
+        df.loc[:, "loc_of_incorporation"] = df["state_of_incorporation"].replace(
+            state_code_to_name
+        )
     df["loc_of_incorporation"] = (
         df["loc_of_incorporation"]
         .fillna(pd.NA)
-        .apply(lambda x: x.str.strip().str.lower())
+        .str.strip()
+        .str.lower()
         .replace("", pd.NA)
     )
     return df
@@ -134,10 +143,7 @@ def clean_company_name(df) -> pd.DataFrame:
             column.
     """
     df["company_name"] = (
-        df["company_name"]
-        .fillna(pd.NA)
-        .apply(lambda x: x.str.strip().str.lower())
-        .replace("", pd.NA)
+        df["company_name"].fillna(pd.NA).str.strip().str.lower().replace("", pd.NA)
     )
     df.loc[:, "company_name_clean"] = company_name_cleaner.apply_name_cleaning(
         df[["company_name"]]
@@ -151,11 +157,12 @@ def clean_company_name(df) -> pd.DataFrame:
     return df
 
 
-def add_parent_company_cik(ex21_df: pd.DataFrame) -> pd.DataFrame:
+def add_parent_company_cik(ex21_df: pd.DataFrame, md: pd.DataFrame) -> pd.DataFrame:
     """Add the CIK of the parent company to Ex. 21 subsidiaries."""
-    ex21_df = ex21_df.merge(
-        md["cik"], how="left", left_on="filename", right_index=True
-    ).rename(columns={"cik": "parent_company_cik"})
+    ex21_df = ex21_df.merge(md[["filename", "cik"]], how="left", on="filename").rename(
+        columns={"cik": "parent_company_cik"}
+    )
+    return ex21_df
 
 
 def match_ex21_subsidiaries_to_filer_company(
@@ -185,6 +192,9 @@ def match_ex21_subsidiaries_to_filer_company(
     merged_df = basic10k_df.merge(
         ex21_df, how="inner", on="company_name", suffixes=("_sec", "_ex21")
     )
+    logger.info(f"basic 10k cols: {basic10k_df.columns}")
+    logger.info(f"ex21 cols: {ex21_df.columns}")
+    logger.info(f"merged cols: {merged_df.columns}")
     # split up the location of incorporation on whitespace, creating a column
     # with lists of word tokens
     merged_df.loc[:, "loc_tokens_sec"] = (
@@ -252,13 +262,20 @@ def match_ex21_subsidiaries_to_filer_company(
             io_manager_key="pandas_parquet_io_manager",
         )
     },
+    partitions_def=year_quarter_partitions,
 )
-def clean_ex21_table(ex21_df: pd.DataFrame) -> pd.DataFrame:
+def clean_ex21_table(
+    ex21_df: pd.DataFrame, sec10k_filing_metadata: pd.DataFrame
+) -> pd.DataFrame:
     """Clean Ex. 21 table of subsidiaries before combing with basic 10k table."""
     ex21_df.loc[:, "filename"] = convert_ex21_id_to_filename(ex21_df)
+    ex21_df = _add_report_year_to_sec(ex21_df, sec10k_filing_metadata)
+    ex21_df = ex21_df.rename(
+        columns={"subsidiary": "company_name", "loc": "loc_of_incorporation"}
+    )
     ex21_df = clean_loc_of_incorporation(ex21_df)
     ex21_df = clean_company_name(ex21_df)
-    ex21_df = add_parent_company_cik(ex21_df)
+    ex21_df = add_parent_company_cik(ex21_df, sec10k_filing_metadata)
     # flatten out the Ex. 21 table
     ex21_df = ex21_df.drop_duplicates(
         subset=["parent_company_cik", "company_name", "loc_of_incorporation"]
@@ -278,9 +295,12 @@ def clean_ex21_table(ex21_df: pd.DataFrame) -> pd.DataFrame:
             # specify a dagster_type?
         ),
     },
+    partitions_def=year_quarter_partitions,
 )
 def sec_output_table(
-    basic_10k_df: pd.DataFrame, clean_ex21_df: pd.DataFrame
+    basic_10k_df: pd.DataFrame,
+    clean_ex21_df: pd.DataFrame,
+    sec10k_filing_metadata: pd.DataFrame,
 ) -> pd.DataFrame:
     """Asset for creating an SEC 10K output table.
 
@@ -293,10 +313,14 @@ def sec_output_table(
         values="value", index="filename", columns="key", aggfunc="first"
     )
     basic_10k_df.columns.name = None
+    basic_10k_df = basic_10k_df.reset_index()
     basic_10k_df = _remove_weird_sec_cols(basic_10k_df)
-
+    basic_10k_df = _add_report_year_to_sec(basic_10k_df, sec10k_filing_metadata)
     # add a location of incorporation to better match it to Ex. 21 subsidiaries
     basic_10k_df = clean_loc_of_incorporation(basic_10k_df)
+    basic_10k_df = basic_10k_df.rename(
+        columns={"company_conformed_name": "company_name"}
+    )
     basic_10k_df = clean_company_name(basic_10k_df)
     ex21_df_with_cik = match_ex21_subsidiaries_to_filer_company(
         basic10k_df=basic_10k_df, ex21_df=clean_ex21_df
@@ -314,8 +338,9 @@ def sec_output_table(
     ex21_non_filing_subs_df.loc[:, "files_10k"] = False
     # create a sec_company_id for the subsidiaries that don't have a CIK
     ex21_non_filing_subs_df.loc[:, "sec_company_id"] = (
-        ex21_non_filing_subs_df["company_name"].str
-        + ex21_non_filing_subs_df["loc_of_incorporation"].str
+        ex21_non_filing_subs_df["company_name"]
+        + "_"
+        + ex21_non_filing_subs_df["loc_of_incorporation"]
     )
     out_df = pd.concat([basic_10k_df, ex21_non_filing_subs_df])
     # this drops records for earlier company names and addresses
@@ -324,4 +349,4 @@ def sec_output_table(
     return out_df
 
 
-production_assets = [sec_output_table]
+production_assets = [sec_output_table, sec10k_filing_metadata]

From c9b62baa83f22f4dfcf7b3048e1009be5685d3f1 Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Sat, 30 Nov 2024 18:08:12 -0500
Subject: [PATCH 134/161] clean up sec output table creation

---
 .../20-kl-validate-sec-output-table.ipynb     | 1064 +++++++++++++++++
 .../models/sec10k/sec_output_table.py         |  105 +-
 2 files changed, 1124 insertions(+), 45 deletions(-)
 create mode 100644 notebooks/20-kl-validate-sec-output-table.ipynb

diff --git a/notebooks/20-kl-validate-sec-output-table.ipynb b/notebooks/20-kl-validate-sec-output-table.ipynb
new file mode 100644
index 0000000..2b28fb9
--- /dev/null
+++ b/notebooks/20-kl-validate-sec-output-table.ipynb
@@ -0,0 +1,1064 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "d383d1dd-6cdc-45ea-a371-105046c009e2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 3"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "3c58ad67-151d-4054-a972-a1e7ee12949f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from upath import UPath"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8d178634-b494-4769-93e3-c0213e4a0326",
+   "metadata": {},
+   "source": [
+    "### Read in SEC output table"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "25e8183d-3248-440c-aa4e-e7ee7db4c487",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# review outputs from Dagster\n",
+    "sec_out_df = pd.read_parquet(UPath(\"gs://sec10k-outputs/v2/out_sec_10k__parents_and_subsidiaries/2023q1.parquet\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "3881bfbd-cdc3-4f9c-92af-9e74d7758e51",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>sec_company_id</th>\n",
+       "      <th>filename</th>\n",
+       "      <th>business_phone</th>\n",
+       "      <th>central_index_key</th>\n",
+       "      <th>city</th>\n",
+       "      <th>company_name</th>\n",
+       "      <th>date_of_name_change</th>\n",
+       "      <th>film_number</th>\n",
+       "      <th>fiscal_year_end</th>\n",
+       "      <th>form_type</th>\n",
+       "      <th>...</th>\n",
+       "      <th>street_1</th>\n",
+       "      <th>street_2</th>\n",
+       "      <th>zip</th>\n",
+       "      <th>report_date</th>\n",
+       "      <th>report_year</th>\n",
+       "      <th>location_of_inc</th>\n",
+       "      <th>company_name_clean</th>\n",
+       "      <th>parent_company_cik</th>\n",
+       "      <th>own_per</th>\n",
+       "      <th>files_10k</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0000001800</td>\n",
+       "      <td>edgar/data/1800/0001628280-23-004026.txt</td>\n",
+       "      <td>2246676100</td>\n",
+       "      <td>0000001800</td>\n",
+       "      <td>abbott park</td>\n",
+       "      <td>abbott laboratories</td>\n",
+       "      <td>None</td>\n",
+       "      <td>23642562</td>\n",
+       "      <td>1231</td>\n",
+       "      <td>10-k</td>\n",
+       "      <td>...</td>\n",
+       "      <td>100 abbott park road</td>\n",
+       "      <td>None</td>\n",
+       "      <td>60064-3500</td>\n",
+       "      <td>2023-02-17</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>illinois</td>\n",
+       "      <td>abbott laboratories</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0000001800_3a nutrition (vietnam) company limi...</td>\n",
+       "      <td>edgar/data/1800/0001628280-23-004026.txt</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>3a nutrition (vietnam) company limited</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>...</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>2023-02-17</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>viet nam</td>\n",
+       "      <td>3a nutrition vietnam company limited</td>\n",
+       "      <td>0000001800</td>\n",
+       "      <td>None</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0000001800_abbott (jiaxing) nutrition co., ltd...</td>\n",
+       "      <td>edgar/data/1800/0001628280-23-004026.txt</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>abbott (jiaxing) nutrition co., ltd</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>...</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>2023-02-17</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>china</td>\n",
+       "      <td>abbott jiaxing nutrition co limited</td>\n",
+       "      <td>0000001800</td>\n",
+       "      <td>None</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>0000001800_abbott (shanghai) diagnostics sales...</td>\n",
+       "      <td>edgar/data/1800/0001628280-23-004026.txt</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>abbott (shanghai) diagnostics sales co., ltd</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>...</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>2023-02-17</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>china</td>\n",
+       "      <td>abbott shanghai diagnostics sales co limited</td>\n",
+       "      <td>0000001800</td>\n",
+       "      <td>None</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0000001800_abbott (uk) finance limited_united ...</td>\n",
+       "      <td>edgar/data/1800/0001628280-23-004026.txt</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>abbott (uk) finance limited</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>...</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>2023-02-17</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>united kingdom</td>\n",
+       "      <td>abbott uk finance limited</td>\n",
+       "      <td>0000001800</td>\n",
+       "      <td>None</td>\n",
+       "      <td>False</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>171358</th>\n",
+       "      <td>0001951118</td>\n",
+       "      <td>edgar/data/1951118/0001853620-23-000117.txt</td>\n",
+       "      <td>(248) 991-6700</td>\n",
+       "      <td>0001951118</td>\n",
+       "      <td>farmington hills</td>\n",
+       "      <td>mercedes-benz auto receivables trust 2022-1</td>\n",
+       "      <td>None</td>\n",
+       "      <td>23764946</td>\n",
+       "      <td>1231</td>\n",
+       "      <td>10-k</td>\n",
+       "      <td>...</td>\n",
+       "      <td>35555 w. twelve mile rd.</td>\n",
+       "      <td>suite 100</td>\n",
+       "      <td>48331</td>\n",
+       "      <td>2023-03-27</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>mercedes benz auto receivables trust 2022 1</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>171359</th>\n",
+       "      <td>0001951752</td>\n",
+       "      <td>edgar/data/1951752/0001951752-23-000016.txt</td>\n",
+       "      <td>3135943495</td>\n",
+       "      <td>0001951752</td>\n",
+       "      <td>dearborn</td>\n",
+       "      <td>ford credit auto owner trust 2022-d</td>\n",
+       "      <td>None</td>\n",
+       "      <td>23751556</td>\n",
+       "      <td>1231</td>\n",
+       "      <td>10-k</td>\n",
+       "      <td>...</td>\n",
+       "      <td>c/o ford motor co , whq ste 801-c1</td>\n",
+       "      <td>one american road</td>\n",
+       "      <td>48126</td>\n",
+       "      <td>2023-03-22</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>None</td>\n",
+       "      <td>ford credit auto owner trust 2022 d</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>171360</th>\n",
+       "      <td>0001954336</td>\n",
+       "      <td>edgar/data/1477336/0001954336-23-000024.txt</td>\n",
+       "      <td>313-656-5500</td>\n",
+       "      <td>0001954336</td>\n",
+       "      <td>wilmington</td>\n",
+       "      <td>ally auto receivables trust 2022-3</td>\n",
+       "      <td>None</td>\n",
+       "      <td>23759320</td>\n",
+       "      <td>1231</td>\n",
+       "      <td>10-k</td>\n",
+       "      <td>...</td>\n",
+       "      <td>1209 orange street</td>\n",
+       "      <td>None</td>\n",
+       "      <td>19801</td>\n",
+       "      <td>2023-03-24</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>ally auto receivables trust 2022 3</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>171361</th>\n",
+       "      <td>0001954436</td>\n",
+       "      <td>edgar/data/1954436/0000929638-23-001050.txt</td>\n",
+       "      <td>(214) 572-8276</td>\n",
+       "      <td>0001954436</td>\n",
+       "      <td>irving</td>\n",
+       "      <td>exeter automobile receivables trust 2022-6</td>\n",
+       "      <td>None</td>\n",
+       "      <td>23784761</td>\n",
+       "      <td>1231</td>\n",
+       "      <td>10-k</td>\n",
+       "      <td>...</td>\n",
+       "      <td>2101 w. john carpenter freeway</td>\n",
+       "      <td>None</td>\n",
+       "      <td>75063</td>\n",
+       "      <td>2023-03-31</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>exeter automobile receivables trust 2022 6</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>171362</th>\n",
+       "      <td>0001955010</td>\n",
+       "      <td>edgar/data/1955010/0001140361-23-012122.txt</td>\n",
+       "      <td>212-326-1500</td>\n",
+       "      <td>0001955010</td>\n",
+       "      <td>new york</td>\n",
+       "      <td>oha senior private lending fund (u) llc</td>\n",
+       "      <td>None</td>\n",
+       "      <td>23740150</td>\n",
+       "      <td>1231</td>\n",
+       "      <td>10-k</td>\n",
+       "      <td>...</td>\n",
+       "      <td>one vanderbilt, 16th floor</td>\n",
+       "      <td>None</td>\n",
+       "      <td>10017</td>\n",
+       "      <td>2023-03-17</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>oha senior private lending fund u limited liab...</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>171363 rows × 27 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                           sec_company_id  \\\n",
+       "0                                              0000001800   \n",
+       "1       0000001800_3a nutrition (vietnam) company limi...   \n",
+       "2       0000001800_abbott (jiaxing) nutrition co., ltd...   \n",
+       "3       0000001800_abbott (shanghai) diagnostics sales...   \n",
+       "4       0000001800_abbott (uk) finance limited_united ...   \n",
+       "...                                                   ...   \n",
+       "171358                                         0001951118   \n",
+       "171359                                         0001951752   \n",
+       "171360                                         0001954336   \n",
+       "171361                                         0001954436   \n",
+       "171362                                         0001955010   \n",
+       "\n",
+       "                                           filename  business_phone  \\\n",
+       "0          edgar/data/1800/0001628280-23-004026.txt      2246676100   \n",
+       "1          edgar/data/1800/0001628280-23-004026.txt            None   \n",
+       "2          edgar/data/1800/0001628280-23-004026.txt            None   \n",
+       "3          edgar/data/1800/0001628280-23-004026.txt            None   \n",
+       "4          edgar/data/1800/0001628280-23-004026.txt            None   \n",
+       "...                                             ...             ...   \n",
+       "171358  edgar/data/1951118/0001853620-23-000117.txt  (248) 991-6700   \n",
+       "171359  edgar/data/1951752/0001951752-23-000016.txt      3135943495   \n",
+       "171360  edgar/data/1477336/0001954336-23-000024.txt    313-656-5500   \n",
+       "171361  edgar/data/1954436/0000929638-23-001050.txt  (214) 572-8276   \n",
+       "171362  edgar/data/1955010/0001140361-23-012122.txt    212-326-1500   \n",
+       "\n",
+       "       central_index_key              city  \\\n",
+       "0             0000001800       abbott park   \n",
+       "1                   None              None   \n",
+       "2                   None              None   \n",
+       "3                   None              None   \n",
+       "4                   None              None   \n",
+       "...                  ...               ...   \n",
+       "171358        0001951118  farmington hills   \n",
+       "171359        0001951752          dearborn   \n",
+       "171360        0001954336        wilmington   \n",
+       "171361        0001954436            irving   \n",
+       "171362        0001955010          new york   \n",
+       "\n",
+       "                                        company_name date_of_name_change  \\\n",
+       "0                                abbott laboratories                None   \n",
+       "1             3a nutrition (vietnam) company limited                None   \n",
+       "2                abbott (jiaxing) nutrition co., ltd                None   \n",
+       "3       abbott (shanghai) diagnostics sales co., ltd                None   \n",
+       "4                        abbott (uk) finance limited                None   \n",
+       "...                                              ...                 ...   \n",
+       "171358   mercedes-benz auto receivables trust 2022-1                None   \n",
+       "171359           ford credit auto owner trust 2022-d                None   \n",
+       "171360            ally auto receivables trust 2022-3                None   \n",
+       "171361    exeter automobile receivables trust 2022-6                None   \n",
+       "171362       oha senior private lending fund (u) llc                None   \n",
+       "\n",
+       "       film_number fiscal_year_end form_type  ...  \\\n",
+       "0         23642562            1231      10-k  ...   \n",
+       "1             None            None      None  ...   \n",
+       "2             None            None      None  ...   \n",
+       "3             None            None      None  ...   \n",
+       "4             None            None      None  ...   \n",
+       "...            ...             ...       ...  ...   \n",
+       "171358    23764946            1231      10-k  ...   \n",
+       "171359    23751556            1231      10-k  ...   \n",
+       "171360    23759320            1231      10-k  ...   \n",
+       "171361    23784761            1231      10-k  ...   \n",
+       "171362    23740150            1231      10-k  ...   \n",
+       "\n",
+       "                                  street_1           street_2         zip  \\\n",
+       "0                     100 abbott park road               None  60064-3500   \n",
+       "1                                     None               None        None   \n",
+       "2                                     None               None        None   \n",
+       "3                                     None               None        None   \n",
+       "4                                     None               None        None   \n",
+       "...                                    ...                ...         ...   \n",
+       "171358            35555 w. twelve mile rd.          suite 100       48331   \n",
+       "171359  c/o ford motor co , whq ste 801-c1  one american road       48126   \n",
+       "171360                  1209 orange street               None       19801   \n",
+       "171361      2101 w. john carpenter freeway               None       75063   \n",
+       "171362          one vanderbilt, 16th floor               None       10017   \n",
+       "\n",
+       "       report_date report_year location_of_inc  \\\n",
+       "0       2023-02-17        2023        illinois   \n",
+       "1       2023-02-17        2023        viet nam   \n",
+       "2       2023-02-17        2023           china   \n",
+       "3       2023-02-17        2023           china   \n",
+       "4       2023-02-17        2023  united kingdom   \n",
+       "...            ...         ...             ...   \n",
+       "171358  2023-03-27        2023        delaware   \n",
+       "171359  2023-03-22        2023            None   \n",
+       "171360  2023-03-24        2023        delaware   \n",
+       "171361  2023-03-31        2023        delaware   \n",
+       "171362  2023-03-17        2023        delaware   \n",
+       "\n",
+       "                                       company_name_clean parent_company_cik  \\\n",
+       "0                                     abbott laboratories               None   \n",
+       "1                    3a nutrition vietnam company limited         0000001800   \n",
+       "2                     abbott jiaxing nutrition co limited         0000001800   \n",
+       "3            abbott shanghai diagnostics sales co limited         0000001800   \n",
+       "4                               abbott uk finance limited         0000001800   \n",
+       "...                                                   ...                ...   \n",
+       "171358        mercedes benz auto receivables trust 2022 1               None   \n",
+       "171359                ford credit auto owner trust 2022 d               None   \n",
+       "171360                 ally auto receivables trust 2022 3               None   \n",
+       "171361         exeter automobile receivables trust 2022 6               None   \n",
+       "171362  oha senior private lending fund u limited liab...               None   \n",
+       "\n",
+       "       own_per files_10k  \n",
+       "0         None      True  \n",
+       "1         None     False  \n",
+       "2         None     False  \n",
+       "3         None     False  \n",
+       "4         None     False  \n",
+       "...        ...       ...  \n",
+       "171358    None      True  \n",
+       "171359    None      True  \n",
+       "171360    None      True  \n",
+       "171361    None      True  \n",
+       "171362    None      True  \n",
+       "\n",
+       "[171363 rows x 27 columns]"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sec_out_df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3447dcdb-4506-4de0-9201-9711ff9259ee",
+   "metadata": {},
+   "source": [
+    "### There is a combination of SEC 10K filers and subsidiary companies:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "0d654dfc-2fb2-41d3-9ff8-6fe70732a04a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "files_10k\n",
+       "False    165824\n",
+       "True       5539\n",
+       "Name: count, dtype: int64"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sec_out_df.files_10k.value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6797b5b7-be91-430a-a30c-cc26c62aa7b1",
+   "metadata": {},
+   "source": [
+    "### `sec_company_id` and non-null `central_index_key` values should be unique:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "67e0e789-feb0-4866-ba82-8346c62c1bef",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sec_out_df.sec_company_id.is_unique"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "053d65c9-dbdd-4622-a4ee-badc7db2a88d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sec_out_df.central_index_key.dropna().is_unique"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b7e05e03-fa05-4655-a085-c66afcfba442",
+   "metadata": {},
+   "source": [
+    "### Location of incorporation should be clean and standardized for filers and subsidiaries."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "cb33b703-be24-4ddc-a9f2-148850c3f4af",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "location_of_inc\n",
+       "delaware          3076\n",
+       "nevada             300\n",
+       "maryland           299\n",
+       "cayman islands     135\n",
+       "north carolina      92\n",
+       "new york            74\n",
+       "florida             74\n",
+       "pennsylvania        71\n",
+       "california          57\n",
+       "texas               56\n",
+       "Name: count, dtype: int64"
+      ]
+     },
+     "execution_count": 32,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sec_out_df[sec_out_df.files_10k][\"location_of_inc\"].value_counts().head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "cb6fc7b5-b9c0-46ae-991c-cae41f86e8f3",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "location_of_inc\n",
+       "bahamas                  1\n",
+       "germany                  1\n",
+       "hong kong                1\n",
+       "china                    1\n",
+       "virgin islands, u.s.     1\n",
+       "quebec, canada           1\n",
+       "new brunswick, canada    1\n",
+       "new hampshire            1\n",
+       "netherlands antilles     1\n",
+       "malaysia                 1\n",
+       "Name: count, dtype: int64"
+      ]
+     },
+     "execution_count": 33,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sec_out_df[sec_out_df.files_10k][\"location_of_inc\"].value_counts().tail(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "5373ced2-75e9-4229-b927-3ad4b8d33e39",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "location_of_inc\n",
+       "delaware          67546\n",
+       "united kingdom     4979\n",
+       "cayman islands     3000\n",
+       "texas              2881\n",
+       "netherlands        2615\n",
+       "california         2566\n",
+       "germany            2381\n",
+       "china              2305\n",
+       "florida            2130\n",
+       "australia          1938\n",
+       "Name: count, dtype: int64"
+      ]
+     },
+     "execution_count": 34,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sec_out_df[~sec_out_df.files_10k][\"location_of_inc\"].value_counts().head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "3ceb1aa2-c622-4a97-9293-281325637f09",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "location_of_inc\n",
+       "ontario, can                 1\n",
+       "british col, can             1\n",
+       "hong kong china china        1\n",
+       "zhongshan, china             1\n",
+       "jacksonville, florida        1\n",
+       "toronto, ontario, canada     1\n",
+       "limassol, cyprus             1\n",
+       "doncaster, syorkshire, uk    1\n",
+       "manchester, england          1\n",
+       "cote                         1\n",
+       "Name: count, dtype: int64"
+      ]
+     },
+     "execution_count": 35,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sec_out_df[~sec_out_df.files_10k][\"location_of_inc\"].value_counts().tail(10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "95d51bdb-c378-45bc-9848-4a2a8895b470",
+   "metadata": {},
+   "source": [
+    "### All non SEC 10K filers should have a `parent_company_cik`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "89cd6bdb-a06c-40ae-8b49-c610e769f9c8",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "parent_company_cik\n",
+       "False    165824\n",
+       "Name: count, dtype: int64"
+      ]
+     },
+     "execution_count": 36,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sec_out_df[~sec_out_df.files_10k][\"parent_company_cik\"].isnull().value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8f4bd494-951f-417f-ba56-fa0202d741a5",
+   "metadata": {},
+   "source": [
+    "### When run on all year quarters, all `parent_company_cik` should appear in `central_index_key` column"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "id": "d024bc29-d0b1-45cd-a0a2-c9b66e73e0d6",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "2954"
+      ]
+     },
+     "execution_count": 45,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "n_parent_company_cik = len(set(sec_out_df.parent_company_cik))\n",
+    "n_parent_company_cik"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "id": "d70660f2-559e-4ec1-8167-1bfdce45c287",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "2832"
+      ]
+     },
+     "execution_count": 46,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "n_overlap = len(set(sec_out_df.parent_company_cik).intersection(set(sec_out_df.central_index_key)))\n",
+    "n_overlap"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "id": "0eb86d64-5ca0-423a-864c-dbfb00b5b9fa",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "122"
+      ]
+     },
+     "execution_count": 47,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "n_parent_company_cik - n_overlap"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "60366af2-259a-4a87-a93f-2180d8777c67",
+   "metadata": {},
+   "source": [
+    "### There should be filer companies that have a `parent_company_cik` because they were matched to a subsidiary"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "id": "b5c53dab-3be5-48f1-90f6-583acfb452ab",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "parent_company_cik\n",
+       "True     5474\n",
+       "False      65\n",
+       "Name: count, dtype: int64"
+      ]
+     },
+     "execution_count": 49,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sec_out_df[sec_out_df.files_10k].parent_company_cik.isnull().value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5358a4e1-38a7-489d-bf1a-f53de58447ba",
+   "metadata": {},
+   "source": [
+    "### There should be no non-filer companies that have a CIK"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "id": "4a19df26-79c3-4aa1-bcbf-916b822346ca",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "central_index_key\n",
+       "True    165824\n",
+       "Name: count, dtype: int64"
+      ]
+     },
+     "execution_count": 50,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sec_out_df[~sec_out_df.files_10k].central_index_key.isnull().value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bde4f03f-e5b0-4895-ade6-ae44b260e78e",
+   "metadata": {},
+   "source": [
+    "### There should be no duplicated `company_name`, `location_of_inc`, `parent_company_cik` records"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "id": "ca87709a-daa7-4396-83a4-0f5bb8ec2cd4",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>sec_company_id</th>\n",
+       "      <th>filename</th>\n",
+       "      <th>business_phone</th>\n",
+       "      <th>central_index_key</th>\n",
+       "      <th>city</th>\n",
+       "      <th>company_name</th>\n",
+       "      <th>date_of_name_change</th>\n",
+       "      <th>film_number</th>\n",
+       "      <th>fiscal_year_end</th>\n",
+       "      <th>form_type</th>\n",
+       "      <th>...</th>\n",
+       "      <th>street_1</th>\n",
+       "      <th>street_2</th>\n",
+       "      <th>zip</th>\n",
+       "      <th>report_date</th>\n",
+       "      <th>report_year</th>\n",
+       "      <th>location_of_inc</th>\n",
+       "      <th>company_name_clean</th>\n",
+       "      <th>parent_company_cik</th>\n",
+       "      <th>own_per</th>\n",
+       "      <th>files_10k</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>0 rows × 27 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "Empty DataFrame\n",
+       "Columns: [sec_company_id, filename, business_phone, central_index_key, city, company_name, date_of_name_change, film_number, fiscal_year_end, form_type, former_conformed_name, irs_number, sec_act, sec_file_number, standard_industrial_classification, state, state_of_incorporation, street_1, street_2, zip, report_date, report_year, location_of_inc, company_name_clean, parent_company_cik, own_per, files_10k]\n",
+       "Index: []\n",
+       "\n",
+       "[0 rows x 27 columns]"
+      ]
+     },
+     "execution_count": 52,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sec_out_df[sec_out_df.duplicated(subset=[\"company_name\", \"location_of_inc\", \"parent_company_cik\"])]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bca9e395-bd96-4183-b299-46cd589d97d5",
+   "metadata": {},
+   "source": [
+    "### There can be companies with the same name, location, and CIK, but different parent companies."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "id": "cc1880f3-a9d3-4f8a-a42b-2f9ff428ca45",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sec_out_df = sec_out_df.fillna({\"central_index_key\": pd.NA})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "id": "f87257df-00f7-48a8-882a-fb1ea8c27e18",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>company_name</th>\n",
+       "      <th>location_of_inc</th>\n",
+       "      <th>central_index_key</th>\n",
+       "      <th>parent_company_cik</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "Empty DataFrame\n",
+       "Columns: [company_name, location_of_inc, central_index_key, parent_company_cik]\n",
+       "Index: []"
+      ]
+     },
+     "execution_count": 59,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sec_out_df[~sec_out_df.central_index_key.isnull() \n",
+    "           & (sec_out_df.duplicated(\n",
+    "               subset=[\"company_name\", \"location_of_inc\", \"central_index_key\"], keep=False\n",
+    "             ))][[\"company_name\", \"location_of_inc\", \"central_index_key\", \"parent_company_cik\"]]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e2169181-dcd8-4b43-b03e-9526f597147d",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "mozilla_sec_eia",
+   "language": "python",
+   "name": "mozilla_sec_eia"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/src/mozilla_sec_eia/models/sec10k/sec_output_table.py b/src/mozilla_sec_eia/models/sec10k/sec_output_table.py
index a00cc99..1ccaea9 100644
--- a/src/mozilla_sec_eia/models/sec10k/sec_output_table.py
+++ b/src/mozilla_sec_eia/models/sec10k/sec_output_table.py
@@ -1,7 +1,6 @@
 """Module for creating an SEC 10K output table with filing companies and subsidiary companies."""
 
 import logging
-
 import re
 from importlib import resources
 from pathlib import Path
@@ -11,7 +10,6 @@
 from dagster import AssetIn, AssetOut, multi_asset
 
 from mozilla_sec_eia.models.sec10k.utils.cloud import (
-    GCSArchive,
     convert_ex21_id_to_filename,
 )
 from mozilla_sec_eia.models.sec_eia_record_linkage.preprocessing import (
@@ -25,10 +23,6 @@
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
-# TODO: should this be a shared asset? Can you use the existing sec_10k_filing_metadata with all year quarters?
-# archive = GCSArchive()
-# md = archive.get_metadata()
-
 INVALID_NAMES = [
     "llc",
     "limited liability company",
@@ -73,18 +67,20 @@ def _add_report_year_to_sec(sec_df: pd.DataFrame, md: pd.DataFrame) -> pd.DataFr
 
 
 def _flatten_sec_companies_across_time(sec_df: pd.DataFrame) -> pd.DataFrame:
-    """Keep only the most recent record for each unique SEC CIK.
+    """Keep only the most recent record for each unique SEC company.
 
     Note that this drops old records for companies that have changed
-    names or addresses across time.
+    names or addresses across time. Also, we group by sec_company_id not
+    CIK, so filer companies and subsidiary companies are unique in the
+    output dataframe.
     TODO: create an asset that tracks name and address chnages across
     time.
     """
     sec_df = (
         sec_df.sort_values(by="report_year", ascending=False)
-        .groupby("central_index_key")
+        .groupby("sec_company_id")
         .first()
-    )
+    ).reset_index()
     return sec_df
 
 
@@ -113,20 +109,19 @@ def get_sec_state_code_dict() -> dict[str, str]:
     return state_code_dict
 
 
-def clean_loc_of_incorporation(df) -> pd.DataFrame:
+def clean_location_of_inc(df) -> pd.DataFrame:
     """Clean location of incorporation column in SEC basic 10K or Ex. 21 dataframe.
 
     Arguments:
-        df: Ex. 21 or SEC 10K basic info dataframe with loc_of_incorporation
+        df: Ex. 21 or SEC 10K basic info dataframe with location_of_inc
             column.
     """
     if "state_of_incorporation" in df:
-        state_code_to_name = get_sec_state_code_dict()
-        df.loc[:, "loc_of_incorporation"] = df["state_of_incorporation"].replace(
-            state_code_to_name
-        )
-    df["loc_of_incorporation"] = (
-        df["loc_of_incorporation"]
+        df.loc[:, "location_of_inc"] = df["state_of_incorporation"]
+    state_code_to_name = get_sec_state_code_dict()
+    df.loc[:, "location_of_inc"] = (
+        df["location_of_inc"]
+        .replace(state_code_to_name)
         .fillna(pd.NA)
         .str.strip()
         .str.lower()
@@ -162,6 +157,9 @@ def add_parent_company_cik(ex21_df: pd.DataFrame, md: pd.DataFrame) -> pd.DataFr
     ex21_df = ex21_df.merge(md[["filename", "cik"]], how="left", on="filename").rename(
         columns={"cik": "parent_company_cik"}
     )
+    ex21_df.loc[:, "parent_company_cik"] = (
+        ex21_df["parent_company_cik"].astype(str).str.zfill(10)
+    )
     return ex21_df
 
 
@@ -185,23 +183,20 @@ def match_ex21_subsidiaries_to_filer_company(
         subset=[
             "central_index_key",
             "company_name",
-            "loc_of_incorporation",
+            "location_of_inc",
             "report_year",
         ]
     )
     merged_df = basic10k_df.merge(
         ex21_df, how="inner", on="company_name", suffixes=("_sec", "_ex21")
     )
-    logger.info(f"basic 10k cols: {basic10k_df.columns}")
-    logger.info(f"ex21 cols: {ex21_df.columns}")
-    logger.info(f"merged cols: {merged_df.columns}")
     # split up the location of incorporation on whitespace, creating a column
     # with lists of word tokens
     merged_df.loc[:, "loc_tokens_sec"] = (
-        merged_df["loc_of_incorporation_sec"].fillna("").str.lower().str.split()
+        merged_df["location_of_inc_sec"].fillna("").str.lower().str.split()
     )
     merged_df.loc[:, "loc_tokens_ex21"] = (
-        merged_df["loc_of_incorporation_ex21"].fillna("").str.lower().str.split()
+        merged_df["location_of_inc_ex21"].fillna("").str.lower().str.split()
     )
     # get the number of words overlapping between location of incorporation tokens
     merged_df["loc_overlap"] = merged_df.apply(
@@ -216,23 +211,28 @@ def match_ex21_subsidiaries_to_filer_company(
     merged_df = merged_df.sort_values(
         by=[
             "company_name",
-            "loc_of_incorporation_ex21",
+            "location_of_inc_ex21",
             "loc_overlap",
             "report_year_diff",
         ],
         ascending=[True, True, False, True],
     )
     # Select the row with the highest loc overlap and nearest report years
-    # for each company name and location pair
+    # for each company name, location, and parent company record
     closest_match_df = merged_df.groupby(
-        ["company_name", "loc_of_incorporation_ex21"], as_index=False
+        ["company_name", "location_of_inc_ex21", "parent_company_cik"], as_index=False
     ).first()
     ex21_with_cik_df = ex21_df.merge(
         closest_match_df[
-            ["company_name", "central_index_key", "loc_of_incorporation_ex21"]
-        ].rename(columns={"loc_of_incorporation_ex21": "loc_of_incorporation"}),
+            [
+                "company_name",
+                "parent_company_cik",
+                "location_of_inc_ex21",
+                "central_index_key",
+            ]
+        ].rename(columns={"location_of_inc_ex21": "location_of_inc"}),
         how="left",
-        on=["company_name", "loc_of_incorporation"],
+        on=["company_name", "location_of_inc", "parent_company_cik"],
     ).rename(columns={"central_index_key": "subsidiary_cik"})
     # if a subsidiary doesn't have a CIK and has a null location
     # but its company name was assigned a CIK (with a different location)
@@ -244,15 +244,33 @@ def match_ex21_subsidiaries_to_filer_company(
     ).rename(columns={"central_index_key": "company_name_merge_cik"})
     ex21_with_cik_df["subsidiary_cik"] = ex21_with_cik_df["subsidiary_cik"].where(
         ~(ex21_with_cik_df.subsidiary_cik.isnull())
-        | ~(ex21_with_cik_df.loc_of_incorporation.isnull()),
+        | ~(ex21_with_cik_df.location_of_inc.isnull()),
         ex21_with_cik_df["company_name_merge_cik"],
     )
+    ex21_with_cik_df = ex21_with_cik_df.drop(columns="company_name_merge_cik")
     ex21_with_cik_df = ex21_with_cik_df.rename(
         columns={"subsidiary_cik": "central_index_key"}
     )
     return ex21_with_cik_df
 
 
+def create_sec_company_id_for_ex21_subs(ex21_df: pd.DataFrame) -> pd.DataFrame:
+    """Create an sec_company_id for Ex. 21 subsidiaries.
+
+    The ID is the string `{parent_company_cik}_{company_name}_{location_of_inc}`.
+    It uniquely identifies Ex. 21 subsidiaries and is necessary for tracking
+    subsidiaries that aren't ultimately matched to a 10K filer company.
+    """
+    ex21_df.loc[:, "sec_company_id"] = (
+        ex21_df["parent_company_cik"]
+        + "_"
+        + ex21_df["company_name"]
+        + "_"
+        + ex21_df["location_of_inc"]
+    )
+    return ex21_df
+
+
 @multi_asset(
     ins={
         "ex21_df": AssetIn("ex21_company_ownership_info"),
@@ -267,19 +285,21 @@ def match_ex21_subsidiaries_to_filer_company(
 def clean_ex21_table(
     ex21_df: pd.DataFrame, sec10k_filing_metadata: pd.DataFrame
 ) -> pd.DataFrame:
-    """Clean Ex. 21 table of subsidiaries before combing with basic 10k table."""
+    """Clean Ex. 21 table of subsidiaries before combining with basic 10k table."""
     ex21_df.loc[:, "filename"] = convert_ex21_id_to_filename(ex21_df)
+    ex21_df = ex21_df.drop(columns=["id"])
     ex21_df = _add_report_year_to_sec(ex21_df, sec10k_filing_metadata)
     ex21_df = ex21_df.rename(
-        columns={"subsidiary": "company_name", "loc": "loc_of_incorporation"}
+        columns={"subsidiary": "company_name", "loc": "location_of_inc"}
     )
-    ex21_df = clean_loc_of_incorporation(ex21_df)
+    ex21_df = clean_location_of_inc(ex21_df)
     ex21_df = clean_company_name(ex21_df)
     ex21_df = add_parent_company_cik(ex21_df, sec10k_filing_metadata)
-    # flatten out the Ex. 21 table
-    ex21_df = ex21_df.drop_duplicates(
-        subset=["parent_company_cik", "company_name", "loc_of_incorporation"]
-    )
+    # add an sec_company_id, ultimately this ID becomes the subsidiary's CIK
+    # if the subsidiary is matched to an SEC filer
+    ex21_df = create_sec_company_id_for_ex21_subs(ex21_df=ex21_df)
+    ex21_df = _flatten_sec_companies_across_time(ex21_df)
+
     return ex21_df
 
 
@@ -317,7 +337,7 @@ def sec_output_table(
     basic_10k_df = _remove_weird_sec_cols(basic_10k_df)
     basic_10k_df = _add_report_year_to_sec(basic_10k_df, sec10k_filing_metadata)
     # add a location of incorporation to better match it to Ex. 21 subsidiaries
-    basic_10k_df = clean_loc_of_incorporation(basic_10k_df)
+    basic_10k_df = clean_location_of_inc(basic_10k_df)
     basic_10k_df = basic_10k_df.rename(
         columns={"company_conformed_name": "company_name"}
     )
@@ -332,16 +352,11 @@ def sec_output_table(
     )
     basic_10k_df.loc[:, "files_10k"] = True
     basic_10k_df.loc[:, "sec_company_id"] = basic_10k_df["central_index_key"]
+    # get the subsidiary companies that weren't matched to a 10K filing company
     ex21_non_filing_subs_df = ex21_df_with_cik[
         ex21_df_with_cik["central_index_key"].isnull()
     ]
     ex21_non_filing_subs_df.loc[:, "files_10k"] = False
-    # create a sec_company_id for the subsidiaries that don't have a CIK
-    ex21_non_filing_subs_df.loc[:, "sec_company_id"] = (
-        ex21_non_filing_subs_df["company_name"]
-        + "_"
-        + ex21_non_filing_subs_df["loc_of_incorporation"]
-    )
     out_df = pd.concat([basic_10k_df, ex21_non_filing_subs_df])
     # this drops records for earlier company names and addresses
     # that have since changed, so we lose some information

From 01b2d23084063dd7572bf009adba8aa822d50c7f Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Sat, 30 Nov 2024 18:08:39 -0500
Subject: [PATCH 135/161] splink notebook change

---
 notebooks/18-kl-splink-sec-eia.ipynb | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/notebooks/18-kl-splink-sec-eia.ipynb b/notebooks/18-kl-splink-sec-eia.ipynb
index 0d74c13..19ab082 100644
--- a/notebooks/18-kl-splink-sec-eia.ipynb
+++ b/notebooks/18-kl-splink-sec-eia.ipynb
@@ -527,6 +527,32 @@
     "unmatched_ex21_df = ex21_with_cik[ex21_with_cik.subsidiary_cik.isnull()]"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 53,
+   "id": "56f41505-421e-4bf7-bfc4-93500e0c5e71",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0    a_1\n",
+       "1    b_2\n",
+       "2    c_3\n",
+       "dtype: object"
+      ]
+     },
+     "execution_count": 53,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df1 = pd.DataFrame({\"text1\": [\"a\", \"b\", \"c\"]})\n",
+    "df2 = pd.DataFrame({\"text2\": [\"1\", \"2\", \"3\"]})\n",
+    "df1[\"text1\"] + \"_\" + df2[\"text2\"]"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "381e4a8f-d8da-4802-b86e-1ac4fc3094db",

From 30d22c96bdff9d893887295479e8c880fe955e1d Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 30 Nov 2024 23:16:10 +0000
Subject: [PATCH 136/161] [pre-commit.ci] auto fixes from pre-commit.com hooks

For more information, see https://pre-commit.ci
---
 .../package_data/formDStateCodes.xsd.xml      | 656 +++++++++---------
 1 file changed, 328 insertions(+), 328 deletions(-)

diff --git a/src/mozilla_sec_eia/package_data/formDStateCodes.xsd.xml b/src/mozilla_sec_eia/package_data/formDStateCodes.xsd.xml
index d5b3c3d..2ec0c2b 100644
--- a/src/mozilla_sec_eia/package_data/formDStateCodes.xsd.xml
+++ b/src/mozilla_sec_eia/package_data/formDStateCodes.xsd.xml
@@ -1,328 +1,328 @@
-<?xml version="1.0"?>
-
-<!-- Filename: formDStateCodes.xsd.xml -->
-<!-- Purpose:  Define the set of EDGAR State and Country Codes specific to Form D. -->
-<!-- Version:  X301 -->
-
-<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"
-           elementFormDefault="qualified">
-
-    <xs:simpleType name="STATE_COUNTRY_CODE">
-        <xs:annotation>
-            <xs:documentation>
-                Set of valid State and Country Codes according to EDGAR.
-            </xs:documentation>
-        </xs:annotation>
-		<xs:restriction base="xs:string">			
-			<xs:enumeration value="AL"/>  <!-- ALABAMA -->
-			<xs:enumeration value="AK"/>  <!-- ALASKA -->
-			<xs:enumeration value="AZ"/>  <!-- ARIZONA -->
-			<xs:enumeration value="AR"/>  <!-- ARKANSAS -->
-			<xs:enumeration value="CA"/>  <!-- CALIFORNIA -->
-			<xs:enumeration value="CO"/>  <!-- COLORADO -->
-			<xs:enumeration value="CT"/>  <!-- CONNECTICUT -->
-			<xs:enumeration value="DE"/>  <!-- DELAWARE -->
-			<xs:enumeration value="DC"/>  <!-- DISTRICT OF COLUMBIA -->
-			<xs:enumeration value="FL"/>  <!-- FLORIDA -->
-			<xs:enumeration value="GA"/>  <!-- GEORGIA -->
-			<xs:enumeration value="HI"/>  <!-- HAWAII -->
-			<xs:enumeration value="ID"/>  <!-- IDAHO -->
-			<xs:enumeration value="IL"/>  <!-- ILLINOIS -->
-			<xs:enumeration value="IN"/>  <!-- INDIANA -->
-			<xs:enumeration value="IA"/>  <!-- IOWA -->
-			<xs:enumeration value="KS"/>  <!-- KANSAS -->
-			<xs:enumeration value="KY"/>  <!-- KENTUCKY -->
-			<xs:enumeration value="LA"/>  <!-- LOUISIANA -->
-			<xs:enumeration value="ME"/>  <!-- MAINE -->
-			<xs:enumeration value="MD"/>  <!-- MARYLAND -->
-			<xs:enumeration value="MA"/>  <!-- MASSACHUSETTS -->
-			<xs:enumeration value="MI"/>  <!-- MICHIGAN -->
-			<xs:enumeration value="MN"/>  <!-- MINNESOTA -->
-			<xs:enumeration value="MS"/>  <!-- MISSISSIPPI -->
-			<xs:enumeration value="MO"/>  <!-- MISSOURI -->
-			<xs:enumeration value="MT"/>  <!-- MONTANA -->
-			<xs:enumeration value="NE"/>  <!-- NEBRASKA -->
-			<xs:enumeration value="NV"/>  <!-- NEVADA -->
-			<xs:enumeration value="NH"/>  <!-- NEW HAMPSHIRE -->
-			<xs:enumeration value="NJ"/>  <!-- NEW JERSEY -->
-			<xs:enumeration value="NM"/>  <!-- NEW MEXICO -->
-			<xs:enumeration value="NY"/>  <!-- NEW YORK -->
-			<xs:enumeration value="NC"/>  <!-- NORTH CAROLINA -->
-			<xs:enumeration value="ND"/>  <!-- NORTH DAKOTA -->
-			<xs:enumeration value="OH"/>  <!-- OHIO -->
-			<xs:enumeration value="OK"/>  <!-- OKLAHOMA -->
-			<xs:enumeration value="OR"/>  <!-- OREGON -->
-			<xs:enumeration value="PA"/>  <!-- PENNSYLVANIA -->
-			<xs:enumeration value="RI"/>  <!-- RHODE ISLAND -->
-			<xs:enumeration value="SC"/>  <!-- SOUTH CAROLINA -->
-			<xs:enumeration value="SD"/>  <!-- SOUTH DAKOTA -->
-			<xs:enumeration value="TN"/>  <!-- TENNESSEE -->
-			<xs:enumeration value="TX"/>  <!-- TEXAS -->
-			<xs:enumeration value="X1"/>  <!-- UNITED STATES -->
-			<xs:enumeration value="UT"/>  <!-- UTAH -->
-			<xs:enumeration value="VT"/>  <!-- VERMONT -->
-			<xs:enumeration value="VA"/>  <!-- VIRGINIA -->
-			<xs:enumeration value="WA"/>  <!-- WASHINGTON -->
-			<xs:enumeration value="WV"/>  <!-- WEST VIRGINIA -->
-			<xs:enumeration value="WI"/>  <!-- WISCONSIN -->
-			<xs:enumeration value="WY"/>  <!-- WYOMING -->
-			<xs:enumeration value="A0"/>  <!-- ALBERTA, CANADA -->
-			<xs:enumeration value="A1"/>  <!-- BRITISH COLUMBIA, CANADA -->
-			<xs:enumeration value="A2"/>  <!-- MANITOBA, CANADA -->
-			<xs:enumeration value="A3"/>  <!-- NEW BRUNSWICK, CANADA -->
-			<xs:enumeration value="A4"/>  <!-- NEWFOUNDLAND, CANADA -->
-			<xs:enumeration value="A5"/>  <!-- NOVA SCOTIA, CANADA -->
-			<xs:enumeration value="A6"/>  <!-- ONTARIO, CANADA -->
-			<xs:enumeration value="A7"/>  <!-- PRINCE EDWARD ISLAND, CANADA -->
-			<xs:enumeration value="A8"/>  <!-- QUEBEC, CANADA -->
-			<xs:enumeration value="A9"/>  <!-- SASKATCHEWAN, CANADA -->
-			<xs:enumeration value="B0"/>  <!-- YUKON, CANADA -->
-			<xs:enumeration value="Z4"/>  <!-- CANADA (FEDERAL LEVEL) -->
-			<xs:enumeration value="B2"/>  <!-- AFGHANISTAN -->
-			<xs:enumeration value="Y6"/>  <!-- ALAND ISLANDS -->
-			<xs:enumeration value="B3"/>  <!-- ALBANIA -->
-			<xs:enumeration value="B4"/>  <!-- ALGERIA -->
-			<xs:enumeration value="B5"/>  <!-- AMERICAN SAMOA -->
-			<xs:enumeration value="B6"/>  <!-- ANDORRA -->
-			<xs:enumeration value="B7"/>  <!-- ANGOLA -->
-			<xs:enumeration value="1A"/>  <!-- ANGUILLA -->
-			<xs:enumeration value="B8"/>  <!-- ANTARCTICA -->
-			<xs:enumeration value="B9"/>  <!-- ANTIGUA AND BARBUDA -->
-			<xs:enumeration value="C1"/>  <!-- ARGENTINA -->
-			<xs:enumeration value="1B"/>  <!-- ARMENIA -->
-			<xs:enumeration value="1C"/>  <!-- ARUBA -->
-			<xs:enumeration value="C3"/>  <!-- AUSTRALIA -->
-			<xs:enumeration value="C4"/>  <!-- AUSTRIA -->
-			<xs:enumeration value="1D"/>  <!-- AZERBAIJAN -->
-			<xs:enumeration value="C5"/>  <!-- BAHAMAS -->
-			<xs:enumeration value="C6"/>  <!-- BAHRAIN -->
-			<xs:enumeration value="C7"/>  <!-- BANGLADESH -->
-			<xs:enumeration value="C8"/>  <!-- BARBADOS -->
-			<xs:enumeration value="1F"/>  <!-- BELARUS -->
-			<xs:enumeration value="C9"/>  <!-- BELGIUM -->
-			<xs:enumeration value="D1"/>  <!-- BELIZE -->
-			<xs:enumeration value="G6"/>  <!-- BENIN -->
-			<xs:enumeration value="D0"/>  <!-- BERMUDA -->
-			<xs:enumeration value="D2"/>  <!-- BHUTAN -->
-			<xs:enumeration value="D3"/>  <!-- BOLIVIA -->
-			<xs:enumeration value="1E"/>  <!-- BOSNIA AND HERZEGOVINA -->
-			<xs:enumeration value="B1"/>  <!-- BOTSWANA -->
-			<xs:enumeration value="D4"/>  <!-- BOUVET ISLAND -->
-			<xs:enumeration value="D5"/>  <!-- BRAZIL -->
-			<xs:enumeration value="D6"/>  <!-- BRITISH INDIAN OCEAN TERRITORY -->
-			<xs:enumeration value="D9"/>  <!-- BRUNEI DARUSSALAM -->
-			<xs:enumeration value="E0"/>  <!-- BULGARIA -->
-			<xs:enumeration value="X2"/>  <!-- BURKINA FASO -->
-			<xs:enumeration value="E2"/>  <!-- BURUNDI -->
-			<xs:enumeration value="E3"/>  <!-- CAMBODIA -->
-			<xs:enumeration value="E4"/>  <!-- CAMEROON -->
-			<xs:enumeration value="E8"/>  <!-- CAPE VERDE -->
-			<xs:enumeration value="E9"/>  <!-- CAYMAN ISLANDS -->
-			<xs:enumeration value="F0"/>  <!-- CENTRAL AFRICAN REPUBLIC -->
-			<xs:enumeration value="F2"/>  <!-- CHAD -->
-			<xs:enumeration value="F3"/>  <!-- CHILE -->
-			<xs:enumeration value="F4"/>  <!-- CHINA -->
-			<xs:enumeration value="F6"/>  <!-- CHRISTMAS ISLAND -->
-			<xs:enumeration value="F7"/>  <!-- COCOS (KEELING) ISLANDS -->
-			<xs:enumeration value="F8"/>  <!-- COLOMBIA -->
-			<xs:enumeration value="F9"/>  <!-- COMOROS -->
-			<xs:enumeration value="G0"/>  <!-- CONGO -->
-			<xs:enumeration value="Y3"/>  <!-- CONGO, THE DEMOCRATIC REPUBLIC OF THE -->
-			<xs:enumeration value="G1"/>  <!-- COOK ISLANDS -->
-			<xs:enumeration value="G2"/>  <!-- COSTA RICA -->
-			<xs:enumeration value="L7"/>  <!-- COTE D'IVOIRE -->
-			<xs:enumeration value="1M"/>  <!-- CROATIA -->
-			<xs:enumeration value="G3"/>  <!-- CUBA -->
-			<xs:enumeration value="G4"/>  <!-- CYPRUS -->
-			<xs:enumeration value="2N"/>  <!-- CZECH REPUBLIC -->
-			<xs:enumeration value="G7"/>  <!-- DENMARK -->
-			<xs:enumeration value="1G"/>  <!-- DJIBOUTI -->
-			<xs:enumeration value="G9"/>  <!-- DOMINICA -->
-			<xs:enumeration value="G8"/>  <!-- DOMINICAN REPUBLIC -->
-			<xs:enumeration value="H1"/>  <!-- ECUADOR -->
-			<xs:enumeration value="H2"/>  <!-- EGYPT -->
-			<xs:enumeration value="H3"/>  <!-- EL SALVADOR -->
-			<xs:enumeration value="H4"/>  <!-- EQUATORIAL GUINEA -->
-			<xs:enumeration value="1J"/>  <!-- ERITREA -->
-			<xs:enumeration value="1H"/>  <!-- ESTONIA -->
-			<xs:enumeration value="H5"/>  <!-- ETHIOPIA -->
-			<xs:enumeration value="H7"/>  <!-- FALKLAND ISLANDS (MALVINAS) -->
-			<xs:enumeration value="H6"/>  <!-- FAROE ISLANDS -->
-			<xs:enumeration value="H8"/>  <!-- FIJI -->
-			<xs:enumeration value="H9"/>  <!-- FINLAND -->
-			<xs:enumeration value="I0"/>  <!-- FRANCE -->
-			<xs:enumeration value="I3"/>  <!-- FRENCH GUIANA -->
-			<xs:enumeration value="I4"/>  <!-- FRENCH POLYNESIA -->
-			<xs:enumeration value="2C"/>  <!-- FRENCH SOUTHERN TERRITORIES -->
-			<xs:enumeration value="I5"/>  <!-- GABON -->
-			<xs:enumeration value="I6"/>  <!-- GAMBIA -->
-			<xs:enumeration value="2Q"/>  <!-- GEORGIA -->
-			<xs:enumeration value="2M"/>  <!-- GERMANY -->
-			<xs:enumeration value="J0"/>  <!-- GHANA -->
-			<xs:enumeration value="J1"/>  <!-- GIBRALTAR -->
-			<xs:enumeration value="J3"/>  <!-- GREECE -->
-			<xs:enumeration value="J4"/>  <!-- GREENLAND -->
-			<xs:enumeration value="J5"/>  <!-- GRENADA -->
-			<xs:enumeration value="J6"/>  <!-- GUADELOUPE -->
-			<xs:enumeration value="GU"/>  <!-- GUAM -->
-			<xs:enumeration value="J8"/>  <!-- GUATEMALA -->
-			<xs:enumeration value="Y7"/>  <!-- GUERNSEY -->
-			<xs:enumeration value="J9"/>  <!-- GUINEA -->
-			<xs:enumeration value="S0"/>  <!-- GUINEA-BISSAU -->
-			<xs:enumeration value="K0"/>  <!-- GUYANA -->
-			<xs:enumeration value="K1"/>  <!-- HAITI -->
-			<xs:enumeration value="K4"/>  <!-- HEARD ISLAND AND MCDONALD ISLANDS -->
-			<xs:enumeration value="X4"/>  <!-- HOLY SEE (VATICAN CITY STATE) -->
-			<xs:enumeration value="K2"/>  <!-- HONDURAS -->
-			<xs:enumeration value="K3"/>  <!-- HONG KONG -->
-			<xs:enumeration value="K5"/>  <!-- HUNGARY -->
-			<xs:enumeration value="K6"/>  <!-- ICELAND -->
-			<xs:enumeration value="K7"/>  <!-- INDIA -->
-			<xs:enumeration value="K8"/>  <!-- INDONESIA -->
-			<xs:enumeration value="K9"/>  <!-- IRAN, ISLAMIC REPUBLIC OF -->
-			<xs:enumeration value="L0"/>  <!-- IRAQ -->
-			<xs:enumeration value="L2"/>  <!-- IRELAND -->
-			<xs:enumeration value="Y8"/>  <!-- ISLE OF MAN -->
-			<xs:enumeration value="L3"/>  <!-- ISRAEL -->
-			<xs:enumeration value="L6"/>  <!-- ITALY -->
-			<xs:enumeration value="L8"/>  <!-- JAMAICA -->
-			<xs:enumeration value="M0"/>  <!-- JAPAN -->
-			<xs:enumeration value="Y9"/>  <!-- JERSEY -->
-			<xs:enumeration value="M2"/>  <!-- JORDAN -->
-			<xs:enumeration value="1P"/>  <!-- KAZAKSTAN -->
-			<xs:enumeration value="M3"/>  <!-- KENYA -->
-			<xs:enumeration value="J2"/>  <!-- KIRIBATI -->
-			<xs:enumeration value="M4"/>  <!-- KOREA, DEMOCRATIC PEOPLE'S REPUBLIC OF -->
-			<xs:enumeration value="M5"/>  <!-- KOREA, REPUBLIC OF -->
-			<xs:enumeration value="M6"/>  <!-- KUWAIT -->
-			<xs:enumeration value="1N"/>  <!-- KYRGYZSTAN -->
-			<xs:enumeration value="M7"/>  <!-- LAO PEOPLE'S DEMOCRATIC REPUBLIC -->
-			<xs:enumeration value="1R"/>  <!-- LATVIA -->
-			<xs:enumeration value="M8"/>  <!-- LEBANON -->
-			<xs:enumeration value="M9"/>  <!-- LESOTHO -->
-			<xs:enumeration value="N0"/>  <!-- LIBERIA -->
-			<xs:enumeration value="N1"/>  <!-- LIBYAN ARAB JAMAHIRIYA -->
-			<xs:enumeration value="N2"/>  <!-- LIECHTENSTEIN -->
-			<xs:enumeration value="1Q"/>  <!-- LITHUANIA -->
-			<xs:enumeration value="N4"/>  <!-- LUXEMBOURG -->
-			<xs:enumeration value="N5"/>  <!-- MACAU -->
-			<xs:enumeration value="1U"/>  <!-- MACEDONIA, THE FORMER YUGOSLAV REPUBLIC OF -->
-			<xs:enumeration value="N6"/>  <!-- MADAGASCAR -->
-			<xs:enumeration value="N7"/>  <!-- MALAWI -->
-			<xs:enumeration value="N8"/>  <!-- MALAYSIA -->
-			<xs:enumeration value="N9"/>  <!-- MALDIVES -->
-			<xs:enumeration value="O0"/>  <!-- MALI -->
-			<xs:enumeration value="O1"/>  <!-- MALTA -->
-			<xs:enumeration value="1T"/>  <!-- MARSHALL ISLANDS -->
-			<xs:enumeration value="O2"/>  <!-- MARTINIQUE -->
-			<xs:enumeration value="O3"/>  <!-- MAURITANIA -->
-			<xs:enumeration value="O4"/>  <!-- MAURITIUS -->
-			<xs:enumeration value="2P"/>  <!-- MAYOTTE -->
-			<xs:enumeration value="O5"/>  <!-- MEXICO -->
-			<xs:enumeration value="1K"/>  <!-- MICRONESIA, FEDERATED STATES OF -->
-			<xs:enumeration value="1S"/>  <!-- MOLDOVA, REPUBLIC OF -->
-			<xs:enumeration value="O9"/>  <!-- MONACO -->
-			<xs:enumeration value="P0"/>  <!-- MONGOLIA -->
-			<xs:enumeration value="Z5"/>  <!-- MONTENEGRO -->
-			<xs:enumeration value="P1"/>  <!-- MONTSERRAT -->
-			<xs:enumeration value="P2"/>  <!-- MOROCCO -->
-			<xs:enumeration value="P3"/>  <!-- MOZAMBIQUE -->
-			<xs:enumeration value="E1"/>  <!-- MYANMAR -->
-			<xs:enumeration value="T6"/>  <!-- NAMIBIA -->
-			<xs:enumeration value="P5"/>  <!-- NAURU -->
-			<xs:enumeration value="P6"/>  <!-- NEPAL -->
-			<xs:enumeration value="P7"/>  <!-- NETHERLANDS -->
-			<xs:enumeration value="P8"/>  <!-- NETHERLANDS ANTILLES -->
-			<xs:enumeration value="1W"/>  <!-- NEW CALEDONIA -->
-			<xs:enumeration value="Q2"/>  <!-- NEW ZEALAND -->
-			<xs:enumeration value="Q3"/>  <!-- NICARAGUA -->
-			<xs:enumeration value="Q4"/>  <!-- NIGER -->
-			<xs:enumeration value="Q5"/>  <!-- NIGERIA -->
-			<xs:enumeration value="Q6"/>  <!-- NIUE -->
-			<xs:enumeration value="Q7"/>  <!-- NORFOLK ISLAND -->
-			<xs:enumeration value="1V"/>  <!-- NORTHERN MARIANA ISLANDS -->
-			<xs:enumeration value="Q8"/>  <!-- NORWAY -->
-			<xs:enumeration value="P4"/>  <!-- OMAN -->
-			<xs:enumeration value="R0"/>  <!-- PAKISTAN -->
-			<xs:enumeration value="1Y"/>  <!-- PALAU -->
-			<xs:enumeration value="1X"/>  <!-- PALESTINIAN TERRITORY, OCCUPIED -->
-			<xs:enumeration value="R1"/>  <!-- PANAMA -->
-			<xs:enumeration value="R2"/>  <!-- PAPUA NEW GUINEA -->
-			<xs:enumeration value="R4"/>  <!-- PARAGUAY -->
-			<xs:enumeration value="R5"/>  <!-- PERU -->
-			<xs:enumeration value="R6"/>  <!-- PHILIPPINES -->
-			<xs:enumeration value="R8"/>  <!-- PITCAIRN -->
-			<xs:enumeration value="R9"/>  <!-- POLAND -->
-			<xs:enumeration value="S1"/>  <!-- PORTUGAL -->
-			<xs:enumeration value="PR"/>  <!-- PUERTO RICO -->
-			<xs:enumeration value="S3"/>  <!-- QATAR -->
-			<xs:enumeration value="S4"/>  <!-- REUNION -->
-			<xs:enumeration value="S5"/>  <!-- ROMANIA -->
-			<xs:enumeration value="1Z"/>  <!-- RUSSIAN FEDERATION -->
-			<xs:enumeration value="S6"/>  <!-- RWANDA -->
-			<xs:enumeration value="Z0"/>  <!-- SAINT BARTHELEMY -->
-			<xs:enumeration value="U8"/>  <!-- SAINT HELENA -->
-			<xs:enumeration value="U7"/>  <!-- SAINT KITTS AND NEVIS -->
-			<xs:enumeration value="U9"/>  <!-- SAINT LUCIA -->
-			<xs:enumeration value="Z1"/>  <!-- SAINT MARTIN -->
-			<xs:enumeration value="V0"/>  <!-- SAINT PIERRE AND MIQUELON -->
-			<xs:enumeration value="V1"/>  <!-- SAINT VINCENT AND THE GRENADINES -->
-			<xs:enumeration value="Y0"/>  <!-- SAMOA -->
-			<xs:enumeration value="S8"/>  <!-- SAN MARINO -->
-			<xs:enumeration value="S9"/>  <!-- SAO TOME AND PRINCIPE -->
-			<xs:enumeration value="T0"/>  <!-- SAUDI ARABIA -->
-			<xs:enumeration value="T1"/>  <!-- SENEGAL -->
-			<xs:enumeration value="Z2"/>  <!-- SERBIA -->
-			<xs:enumeration value="T2"/>  <!-- SEYCHELLES -->
-			<xs:enumeration value="T8"/>  <!-- SIERRA LEONE -->
-			<xs:enumeration value="U0"/>  <!-- SINGAPORE -->
-			<xs:enumeration value="2B"/>  <!-- SLOVAKIA -->
-			<xs:enumeration value="2A"/>  <!-- SLOVENIA -->
-			<xs:enumeration value="D7"/>  <!-- SOLOMON ISLANDS -->
-			<xs:enumeration value="U1"/>  <!-- SOMALIA -->
-			<xs:enumeration value="T3"/>  <!-- SOUTH AFRICA -->
-			<xs:enumeration value="1L"/>  <!-- SOUTH GEORGIA AND THE SOUTH SANDWICH ISLANDS -->
-			<xs:enumeration value="U3"/>  <!-- SPAIN -->
-			<xs:enumeration value="F1"/>  <!-- SRI LANKA -->
-			<xs:enumeration value="V2"/>  <!-- SUDAN -->
-			<xs:enumeration value="V3"/>  <!-- SURINAME -->
-			<xs:enumeration value="L9"/>  <!-- SVALBARD AND JAN MAYEN -->
-			<xs:enumeration value="V6"/>  <!-- SWAZILAND -->
-			<xs:enumeration value="V7"/>  <!-- SWEDEN -->
-			<xs:enumeration value="V8"/>  <!-- SWITZERLAND -->
-			<xs:enumeration value="V9"/>  <!-- SYRIAN ARAB REPUBLIC -->
-			<xs:enumeration value="F5"/>  <!-- TAIWAN, PROVINCE OF CHINA -->
-			<xs:enumeration value="2D"/>  <!-- TAJIKISTAN -->
-			<xs:enumeration value="W0"/>  <!-- TANZANIA, UNITED REPUBLIC OF -->
-			<xs:enumeration value="W1"/>  <!-- THAILAND -->
-			<xs:enumeration value="Z3"/>  <!-- TIMOR-LESTE -->
-			<xs:enumeration value="W2"/>  <!-- TOGO -->
-			<xs:enumeration value="W3"/>  <!-- TOKELAU -->
-			<xs:enumeration value="W4"/>  <!-- TONGA -->
-			<xs:enumeration value="W5"/>  <!-- TRINIDAD AND TOBAGO -->
-			<xs:enumeration value="W6"/>  <!-- TUNISIA -->
-			<xs:enumeration value="W8"/>  <!-- TURKEY -->
-			<xs:enumeration value="2E"/>  <!-- TURKMENISTAN -->
-			<xs:enumeration value="W7"/>  <!-- TURKS AND CAICOS ISLANDS -->
-			<xs:enumeration value="2G"/>  <!-- TUVALU -->
-			<xs:enumeration value="W9"/>  <!-- UGANDA -->
-			<xs:enumeration value="2H"/>  <!-- UKRAINE -->
-			<xs:enumeration value="C0"/>  <!-- UNITED ARAB EMIRATES -->
-			<xs:enumeration value="X0"/>  <!-- UNITED KINGDOM -->
-			<xs:enumeration value="2J"/>  <!-- UNITED STATES MINOR OUTLYING ISLANDS -->
-			<xs:enumeration value="X3"/>  <!-- URUGUAY -->
-			<xs:enumeration value="2K"/>  <!-- UZBEKISTAN -->
-			<xs:enumeration value="2L"/>  <!-- VANUATU -->
-			<xs:enumeration value="X5"/>  <!-- VENEZUELA -->
-			<xs:enumeration value="Q1"/>  <!-- VIET NAM -->
-			<xs:enumeration value="D8"/>  <!-- VIRGIN ISLANDS, BRITISH -->
-			<xs:enumeration value="VI"/>  <!-- VIRGIN ISLANDS, U.S. -->
-			<xs:enumeration value="X8"/>  <!-- WALLIS AND FUTUNA -->
-			<xs:enumeration value="U5"/>  <!-- WESTERN SAHARA -->
-			<xs:enumeration value="T7"/>  <!-- YEMEN -->
-			<xs:enumeration value="Y4"/>  <!-- ZAMBIA -->
-			<xs:enumeration value="Y5"/>  <!-- ZIMBABWE -->
-			<xs:enumeration value="XX"/>  <!-- UNKNOWN -->
-		</xs:restriction>
-    </xs:simpleType>
-</xs:schema>
+<?xml version="1.0"?>
+
+<!-- Filename: formDStateCodes.xsd.xml -->
+<!-- Purpose:  Define the set of EDGAR State and Country Codes specific to Form D. -->
+<!-- Version:  X301 -->
+
+<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"
+           elementFormDefault="qualified">
+
+    <xs:simpleType name="STATE_COUNTRY_CODE">
+        <xs:annotation>
+            <xs:documentation>
+                Set of valid State and Country Codes according to EDGAR.
+            </xs:documentation>
+        </xs:annotation>
+		<xs:restriction base="xs:string">			
+			<xs:enumeration value="AL"/>  <!-- ALABAMA -->
+			<xs:enumeration value="AK"/>  <!-- ALASKA -->
+			<xs:enumeration value="AZ"/>  <!-- ARIZONA -->
+			<xs:enumeration value="AR"/>  <!-- ARKANSAS -->
+			<xs:enumeration value="CA"/>  <!-- CALIFORNIA -->
+			<xs:enumeration value="CO"/>  <!-- COLORADO -->
+			<xs:enumeration value="CT"/>  <!-- CONNECTICUT -->
+			<xs:enumeration value="DE"/>  <!-- DELAWARE -->
+			<xs:enumeration value="DC"/>  <!-- DISTRICT OF COLUMBIA -->
+			<xs:enumeration value="FL"/>  <!-- FLORIDA -->
+			<xs:enumeration value="GA"/>  <!-- GEORGIA -->
+			<xs:enumeration value="HI"/>  <!-- HAWAII -->
+			<xs:enumeration value="ID"/>  <!-- IDAHO -->
+			<xs:enumeration value="IL"/>  <!-- ILLINOIS -->
+			<xs:enumeration value="IN"/>  <!-- INDIANA -->
+			<xs:enumeration value="IA"/>  <!-- IOWA -->
+			<xs:enumeration value="KS"/>  <!-- KANSAS -->
+			<xs:enumeration value="KY"/>  <!-- KENTUCKY -->
+			<xs:enumeration value="LA"/>  <!-- LOUISIANA -->
+			<xs:enumeration value="ME"/>  <!-- MAINE -->
+			<xs:enumeration value="MD"/>  <!-- MARYLAND -->
+			<xs:enumeration value="MA"/>  <!-- MASSACHUSETTS -->
+			<xs:enumeration value="MI"/>  <!-- MICHIGAN -->
+			<xs:enumeration value="MN"/>  <!-- MINNESOTA -->
+			<xs:enumeration value="MS"/>  <!-- MISSISSIPPI -->
+			<xs:enumeration value="MO"/>  <!-- MISSOURI -->
+			<xs:enumeration value="MT"/>  <!-- MONTANA -->
+			<xs:enumeration value="NE"/>  <!-- NEBRASKA -->
+			<xs:enumeration value="NV"/>  <!-- NEVADA -->
+			<xs:enumeration value="NH"/>  <!-- NEW HAMPSHIRE -->
+			<xs:enumeration value="NJ"/>  <!-- NEW JERSEY -->
+			<xs:enumeration value="NM"/>  <!-- NEW MEXICO -->
+			<xs:enumeration value="NY"/>  <!-- NEW YORK -->
+			<xs:enumeration value="NC"/>  <!-- NORTH CAROLINA -->
+			<xs:enumeration value="ND"/>  <!-- NORTH DAKOTA -->
+			<xs:enumeration value="OH"/>  <!-- OHIO -->
+			<xs:enumeration value="OK"/>  <!-- OKLAHOMA -->
+			<xs:enumeration value="OR"/>  <!-- OREGON -->
+			<xs:enumeration value="PA"/>  <!-- PENNSYLVANIA -->
+			<xs:enumeration value="RI"/>  <!-- RHODE ISLAND -->
+			<xs:enumeration value="SC"/>  <!-- SOUTH CAROLINA -->
+			<xs:enumeration value="SD"/>  <!-- SOUTH DAKOTA -->
+			<xs:enumeration value="TN"/>  <!-- TENNESSEE -->
+			<xs:enumeration value="TX"/>  <!-- TEXAS -->
+			<xs:enumeration value="X1"/>  <!-- UNITED STATES -->
+			<xs:enumeration value="UT"/>  <!-- UTAH -->
+			<xs:enumeration value="VT"/>  <!-- VERMONT -->
+			<xs:enumeration value="VA"/>  <!-- VIRGINIA -->
+			<xs:enumeration value="WA"/>  <!-- WASHINGTON -->
+			<xs:enumeration value="WV"/>  <!-- WEST VIRGINIA -->
+			<xs:enumeration value="WI"/>  <!-- WISCONSIN -->
+			<xs:enumeration value="WY"/>  <!-- WYOMING -->
+			<xs:enumeration value="A0"/>  <!-- ALBERTA, CANADA -->
+			<xs:enumeration value="A1"/>  <!-- BRITISH COLUMBIA, CANADA -->
+			<xs:enumeration value="A2"/>  <!-- MANITOBA, CANADA -->
+			<xs:enumeration value="A3"/>  <!-- NEW BRUNSWICK, CANADA -->
+			<xs:enumeration value="A4"/>  <!-- NEWFOUNDLAND, CANADA -->
+			<xs:enumeration value="A5"/>  <!-- NOVA SCOTIA, CANADA -->
+			<xs:enumeration value="A6"/>  <!-- ONTARIO, CANADA -->
+			<xs:enumeration value="A7"/>  <!-- PRINCE EDWARD ISLAND, CANADA -->
+			<xs:enumeration value="A8"/>  <!-- QUEBEC, CANADA -->
+			<xs:enumeration value="A9"/>  <!-- SASKATCHEWAN, CANADA -->
+			<xs:enumeration value="B0"/>  <!-- YUKON, CANADA -->
+			<xs:enumeration value="Z4"/>  <!-- CANADA (FEDERAL LEVEL) -->
+			<xs:enumeration value="B2"/>  <!-- AFGHANISTAN -->
+			<xs:enumeration value="Y6"/>  <!-- ALAND ISLANDS -->
+			<xs:enumeration value="B3"/>  <!-- ALBANIA -->
+			<xs:enumeration value="B4"/>  <!-- ALGERIA -->
+			<xs:enumeration value="B5"/>  <!-- AMERICAN SAMOA -->
+			<xs:enumeration value="B6"/>  <!-- ANDORRA -->
+			<xs:enumeration value="B7"/>  <!-- ANGOLA -->
+			<xs:enumeration value="1A"/>  <!-- ANGUILLA -->
+			<xs:enumeration value="B8"/>  <!-- ANTARCTICA -->
+			<xs:enumeration value="B9"/>  <!-- ANTIGUA AND BARBUDA -->
+			<xs:enumeration value="C1"/>  <!-- ARGENTINA -->
+			<xs:enumeration value="1B"/>  <!-- ARMENIA -->
+			<xs:enumeration value="1C"/>  <!-- ARUBA -->
+			<xs:enumeration value="C3"/>  <!-- AUSTRALIA -->
+			<xs:enumeration value="C4"/>  <!-- AUSTRIA -->
+			<xs:enumeration value="1D"/>  <!-- AZERBAIJAN -->
+			<xs:enumeration value="C5"/>  <!-- BAHAMAS -->
+			<xs:enumeration value="C6"/>  <!-- BAHRAIN -->
+			<xs:enumeration value="C7"/>  <!-- BANGLADESH -->
+			<xs:enumeration value="C8"/>  <!-- BARBADOS -->
+			<xs:enumeration value="1F"/>  <!-- BELARUS -->
+			<xs:enumeration value="C9"/>  <!-- BELGIUM -->
+			<xs:enumeration value="D1"/>  <!-- BELIZE -->
+			<xs:enumeration value="G6"/>  <!-- BENIN -->
+			<xs:enumeration value="D0"/>  <!-- BERMUDA -->
+			<xs:enumeration value="D2"/>  <!-- BHUTAN -->
+			<xs:enumeration value="D3"/>  <!-- BOLIVIA -->
+			<xs:enumeration value="1E"/>  <!-- BOSNIA AND HERZEGOVINA -->
+			<xs:enumeration value="B1"/>  <!-- BOTSWANA -->
+			<xs:enumeration value="D4"/>  <!-- BOUVET ISLAND -->
+			<xs:enumeration value="D5"/>  <!-- BRAZIL -->
+			<xs:enumeration value="D6"/>  <!-- BRITISH INDIAN OCEAN TERRITORY -->
+			<xs:enumeration value="D9"/>  <!-- BRUNEI DARUSSALAM -->
+			<xs:enumeration value="E0"/>  <!-- BULGARIA -->
+			<xs:enumeration value="X2"/>  <!-- BURKINA FASO -->
+			<xs:enumeration value="E2"/>  <!-- BURUNDI -->
+			<xs:enumeration value="E3"/>  <!-- CAMBODIA -->
+			<xs:enumeration value="E4"/>  <!-- CAMEROON -->
+			<xs:enumeration value="E8"/>  <!-- CAPE VERDE -->
+			<xs:enumeration value="E9"/>  <!-- CAYMAN ISLANDS -->
+			<xs:enumeration value="F0"/>  <!-- CENTRAL AFRICAN REPUBLIC -->
+			<xs:enumeration value="F2"/>  <!-- CHAD -->
+			<xs:enumeration value="F3"/>  <!-- CHILE -->
+			<xs:enumeration value="F4"/>  <!-- CHINA -->
+			<xs:enumeration value="F6"/>  <!-- CHRISTMAS ISLAND -->
+			<xs:enumeration value="F7"/>  <!-- COCOS (KEELING) ISLANDS -->
+			<xs:enumeration value="F8"/>  <!-- COLOMBIA -->
+			<xs:enumeration value="F9"/>  <!-- COMOROS -->
+			<xs:enumeration value="G0"/>  <!-- CONGO -->
+			<xs:enumeration value="Y3"/>  <!-- CONGO, THE DEMOCRATIC REPUBLIC OF THE -->
+			<xs:enumeration value="G1"/>  <!-- COOK ISLANDS -->
+			<xs:enumeration value="G2"/>  <!-- COSTA RICA -->
+			<xs:enumeration value="L7"/>  <!-- COTE D'IVOIRE -->
+			<xs:enumeration value="1M"/>  <!-- CROATIA -->
+			<xs:enumeration value="G3"/>  <!-- CUBA -->
+			<xs:enumeration value="G4"/>  <!-- CYPRUS -->
+			<xs:enumeration value="2N"/>  <!-- CZECH REPUBLIC -->
+			<xs:enumeration value="G7"/>  <!-- DENMARK -->
+			<xs:enumeration value="1G"/>  <!-- DJIBOUTI -->
+			<xs:enumeration value="G9"/>  <!-- DOMINICA -->
+			<xs:enumeration value="G8"/>  <!-- DOMINICAN REPUBLIC -->
+			<xs:enumeration value="H1"/>  <!-- ECUADOR -->
+			<xs:enumeration value="H2"/>  <!-- EGYPT -->
+			<xs:enumeration value="H3"/>  <!-- EL SALVADOR -->
+			<xs:enumeration value="H4"/>  <!-- EQUATORIAL GUINEA -->
+			<xs:enumeration value="1J"/>  <!-- ERITREA -->
+			<xs:enumeration value="1H"/>  <!-- ESTONIA -->
+			<xs:enumeration value="H5"/>  <!-- ETHIOPIA -->
+			<xs:enumeration value="H7"/>  <!-- FALKLAND ISLANDS (MALVINAS) -->
+			<xs:enumeration value="H6"/>  <!-- FAROE ISLANDS -->
+			<xs:enumeration value="H8"/>  <!-- FIJI -->
+			<xs:enumeration value="H9"/>  <!-- FINLAND -->
+			<xs:enumeration value="I0"/>  <!-- FRANCE -->
+			<xs:enumeration value="I3"/>  <!-- FRENCH GUIANA -->
+			<xs:enumeration value="I4"/>  <!-- FRENCH POLYNESIA -->
+			<xs:enumeration value="2C"/>  <!-- FRENCH SOUTHERN TERRITORIES -->
+			<xs:enumeration value="I5"/>  <!-- GABON -->
+			<xs:enumeration value="I6"/>  <!-- GAMBIA -->
+			<xs:enumeration value="2Q"/>  <!-- GEORGIA -->
+			<xs:enumeration value="2M"/>  <!-- GERMANY -->
+			<xs:enumeration value="J0"/>  <!-- GHANA -->
+			<xs:enumeration value="J1"/>  <!-- GIBRALTAR -->
+			<xs:enumeration value="J3"/>  <!-- GREECE -->
+			<xs:enumeration value="J4"/>  <!-- GREENLAND -->
+			<xs:enumeration value="J5"/>  <!-- GRENADA -->
+			<xs:enumeration value="J6"/>  <!-- GUADELOUPE -->
+			<xs:enumeration value="GU"/>  <!-- GUAM -->
+			<xs:enumeration value="J8"/>  <!-- GUATEMALA -->
+			<xs:enumeration value="Y7"/>  <!-- GUERNSEY -->
+			<xs:enumeration value="J9"/>  <!-- GUINEA -->
+			<xs:enumeration value="S0"/>  <!-- GUINEA-BISSAU -->
+			<xs:enumeration value="K0"/>  <!-- GUYANA -->
+			<xs:enumeration value="K1"/>  <!-- HAITI -->
+			<xs:enumeration value="K4"/>  <!-- HEARD ISLAND AND MCDONALD ISLANDS -->
+			<xs:enumeration value="X4"/>  <!-- HOLY SEE (VATICAN CITY STATE) -->
+			<xs:enumeration value="K2"/>  <!-- HONDURAS -->
+			<xs:enumeration value="K3"/>  <!-- HONG KONG -->
+			<xs:enumeration value="K5"/>  <!-- HUNGARY -->
+			<xs:enumeration value="K6"/>  <!-- ICELAND -->
+			<xs:enumeration value="K7"/>  <!-- INDIA -->
+			<xs:enumeration value="K8"/>  <!-- INDONESIA -->
+			<xs:enumeration value="K9"/>  <!-- IRAN, ISLAMIC REPUBLIC OF -->
+			<xs:enumeration value="L0"/>  <!-- IRAQ -->
+			<xs:enumeration value="L2"/>  <!-- IRELAND -->
+			<xs:enumeration value="Y8"/>  <!-- ISLE OF MAN -->
+			<xs:enumeration value="L3"/>  <!-- ISRAEL -->
+			<xs:enumeration value="L6"/>  <!-- ITALY -->
+			<xs:enumeration value="L8"/>  <!-- JAMAICA -->
+			<xs:enumeration value="M0"/>  <!-- JAPAN -->
+			<xs:enumeration value="Y9"/>  <!-- JERSEY -->
+			<xs:enumeration value="M2"/>  <!-- JORDAN -->
+			<xs:enumeration value="1P"/>  <!-- KAZAKSTAN -->
+			<xs:enumeration value="M3"/>  <!-- KENYA -->
+			<xs:enumeration value="J2"/>  <!-- KIRIBATI -->
+			<xs:enumeration value="M4"/>  <!-- KOREA, DEMOCRATIC PEOPLE'S REPUBLIC OF -->
+			<xs:enumeration value="M5"/>  <!-- KOREA, REPUBLIC OF -->
+			<xs:enumeration value="M6"/>  <!-- KUWAIT -->
+			<xs:enumeration value="1N"/>  <!-- KYRGYZSTAN -->
+			<xs:enumeration value="M7"/>  <!-- LAO PEOPLE'S DEMOCRATIC REPUBLIC -->
+			<xs:enumeration value="1R"/>  <!-- LATVIA -->
+			<xs:enumeration value="M8"/>  <!-- LEBANON -->
+			<xs:enumeration value="M9"/>  <!-- LESOTHO -->
+			<xs:enumeration value="N0"/>  <!-- LIBERIA -->
+			<xs:enumeration value="N1"/>  <!-- LIBYAN ARAB JAMAHIRIYA -->
+			<xs:enumeration value="N2"/>  <!-- LIECHTENSTEIN -->
+			<xs:enumeration value="1Q"/>  <!-- LITHUANIA -->
+			<xs:enumeration value="N4"/>  <!-- LUXEMBOURG -->
+			<xs:enumeration value="N5"/>  <!-- MACAU -->
+			<xs:enumeration value="1U"/>  <!-- MACEDONIA, THE FORMER YUGOSLAV REPUBLIC OF -->
+			<xs:enumeration value="N6"/>  <!-- MADAGASCAR -->
+			<xs:enumeration value="N7"/>  <!-- MALAWI -->
+			<xs:enumeration value="N8"/>  <!-- MALAYSIA -->
+			<xs:enumeration value="N9"/>  <!-- MALDIVES -->
+			<xs:enumeration value="O0"/>  <!-- MALI -->
+			<xs:enumeration value="O1"/>  <!-- MALTA -->
+			<xs:enumeration value="1T"/>  <!-- MARSHALL ISLANDS -->
+			<xs:enumeration value="O2"/>  <!-- MARTINIQUE -->
+			<xs:enumeration value="O3"/>  <!-- MAURITANIA -->
+			<xs:enumeration value="O4"/>  <!-- MAURITIUS -->
+			<xs:enumeration value="2P"/>  <!-- MAYOTTE -->
+			<xs:enumeration value="O5"/>  <!-- MEXICO -->
+			<xs:enumeration value="1K"/>  <!-- MICRONESIA, FEDERATED STATES OF -->
+			<xs:enumeration value="1S"/>  <!-- MOLDOVA, REPUBLIC OF -->
+			<xs:enumeration value="O9"/>  <!-- MONACO -->
+			<xs:enumeration value="P0"/>  <!-- MONGOLIA -->
+			<xs:enumeration value="Z5"/>  <!-- MONTENEGRO -->
+			<xs:enumeration value="P1"/>  <!-- MONTSERRAT -->
+			<xs:enumeration value="P2"/>  <!-- MOROCCO -->
+			<xs:enumeration value="P3"/>  <!-- MOZAMBIQUE -->
+			<xs:enumeration value="E1"/>  <!-- MYANMAR -->
+			<xs:enumeration value="T6"/>  <!-- NAMIBIA -->
+			<xs:enumeration value="P5"/>  <!-- NAURU -->
+			<xs:enumeration value="P6"/>  <!-- NEPAL -->
+			<xs:enumeration value="P7"/>  <!-- NETHERLANDS -->
+			<xs:enumeration value="P8"/>  <!-- NETHERLANDS ANTILLES -->
+			<xs:enumeration value="1W"/>  <!-- NEW CALEDONIA -->
+			<xs:enumeration value="Q2"/>  <!-- NEW ZEALAND -->
+			<xs:enumeration value="Q3"/>  <!-- NICARAGUA -->
+			<xs:enumeration value="Q4"/>  <!-- NIGER -->
+			<xs:enumeration value="Q5"/>  <!-- NIGERIA -->
+			<xs:enumeration value="Q6"/>  <!-- NIUE -->
+			<xs:enumeration value="Q7"/>  <!-- NORFOLK ISLAND -->
+			<xs:enumeration value="1V"/>  <!-- NORTHERN MARIANA ISLANDS -->
+			<xs:enumeration value="Q8"/>  <!-- NORWAY -->
+			<xs:enumeration value="P4"/>  <!-- OMAN -->
+			<xs:enumeration value="R0"/>  <!-- PAKISTAN -->
+			<xs:enumeration value="1Y"/>  <!-- PALAU -->
+			<xs:enumeration value="1X"/>  <!-- PALESTINIAN TERRITORY, OCCUPIED -->
+			<xs:enumeration value="R1"/>  <!-- PANAMA -->
+			<xs:enumeration value="R2"/>  <!-- PAPUA NEW GUINEA -->
+			<xs:enumeration value="R4"/>  <!-- PARAGUAY -->
+			<xs:enumeration value="R5"/>  <!-- PERU -->
+			<xs:enumeration value="R6"/>  <!-- PHILIPPINES -->
+			<xs:enumeration value="R8"/>  <!-- PITCAIRN -->
+			<xs:enumeration value="R9"/>  <!-- POLAND -->
+			<xs:enumeration value="S1"/>  <!-- PORTUGAL -->
+			<xs:enumeration value="PR"/>  <!-- PUERTO RICO -->
+			<xs:enumeration value="S3"/>  <!-- QATAR -->
+			<xs:enumeration value="S4"/>  <!-- REUNION -->
+			<xs:enumeration value="S5"/>  <!-- ROMANIA -->
+			<xs:enumeration value="1Z"/>  <!-- RUSSIAN FEDERATION -->
+			<xs:enumeration value="S6"/>  <!-- RWANDA -->
+			<xs:enumeration value="Z0"/>  <!-- SAINT BARTHELEMY -->
+			<xs:enumeration value="U8"/>  <!-- SAINT HELENA -->
+			<xs:enumeration value="U7"/>  <!-- SAINT KITTS AND NEVIS -->
+			<xs:enumeration value="U9"/>  <!-- SAINT LUCIA -->
+			<xs:enumeration value="Z1"/>  <!-- SAINT MARTIN -->
+			<xs:enumeration value="V0"/>  <!-- SAINT PIERRE AND MIQUELON -->
+			<xs:enumeration value="V1"/>  <!-- SAINT VINCENT AND THE GRENADINES -->
+			<xs:enumeration value="Y0"/>  <!-- SAMOA -->
+			<xs:enumeration value="S8"/>  <!-- SAN MARINO -->
+			<xs:enumeration value="S9"/>  <!-- SAO TOME AND PRINCIPE -->
+			<xs:enumeration value="T0"/>  <!-- SAUDI ARABIA -->
+			<xs:enumeration value="T1"/>  <!-- SENEGAL -->
+			<xs:enumeration value="Z2"/>  <!-- SERBIA -->
+			<xs:enumeration value="T2"/>  <!-- SEYCHELLES -->
+			<xs:enumeration value="T8"/>  <!-- SIERRA LEONE -->
+			<xs:enumeration value="U0"/>  <!-- SINGAPORE -->
+			<xs:enumeration value="2B"/>  <!-- SLOVAKIA -->
+			<xs:enumeration value="2A"/>  <!-- SLOVENIA -->
+			<xs:enumeration value="D7"/>  <!-- SOLOMON ISLANDS -->
+			<xs:enumeration value="U1"/>  <!-- SOMALIA -->
+			<xs:enumeration value="T3"/>  <!-- SOUTH AFRICA -->
+			<xs:enumeration value="1L"/>  <!-- SOUTH GEORGIA AND THE SOUTH SANDWICH ISLANDS -->
+			<xs:enumeration value="U3"/>  <!-- SPAIN -->
+			<xs:enumeration value="F1"/>  <!-- SRI LANKA -->
+			<xs:enumeration value="V2"/>  <!-- SUDAN -->
+			<xs:enumeration value="V3"/>  <!-- SURINAME -->
+			<xs:enumeration value="L9"/>  <!-- SVALBARD AND JAN MAYEN -->
+			<xs:enumeration value="V6"/>  <!-- SWAZILAND -->
+			<xs:enumeration value="V7"/>  <!-- SWEDEN -->
+			<xs:enumeration value="V8"/>  <!-- SWITZERLAND -->
+			<xs:enumeration value="V9"/>  <!-- SYRIAN ARAB REPUBLIC -->
+			<xs:enumeration value="F5"/>  <!-- TAIWAN, PROVINCE OF CHINA -->
+			<xs:enumeration value="2D"/>  <!-- TAJIKISTAN -->
+			<xs:enumeration value="W0"/>  <!-- TANZANIA, UNITED REPUBLIC OF -->
+			<xs:enumeration value="W1"/>  <!-- THAILAND -->
+			<xs:enumeration value="Z3"/>  <!-- TIMOR-LESTE -->
+			<xs:enumeration value="W2"/>  <!-- TOGO -->
+			<xs:enumeration value="W3"/>  <!-- TOKELAU -->
+			<xs:enumeration value="W4"/>  <!-- TONGA -->
+			<xs:enumeration value="W5"/>  <!-- TRINIDAD AND TOBAGO -->
+			<xs:enumeration value="W6"/>  <!-- TUNISIA -->
+			<xs:enumeration value="W8"/>  <!-- TURKEY -->
+			<xs:enumeration value="2E"/>  <!-- TURKMENISTAN -->
+			<xs:enumeration value="W7"/>  <!-- TURKS AND CAICOS ISLANDS -->
+			<xs:enumeration value="2G"/>  <!-- TUVALU -->
+			<xs:enumeration value="W9"/>  <!-- UGANDA -->
+			<xs:enumeration value="2H"/>  <!-- UKRAINE -->
+			<xs:enumeration value="C0"/>  <!-- UNITED ARAB EMIRATES -->
+			<xs:enumeration value="X0"/>  <!-- UNITED KINGDOM -->
+			<xs:enumeration value="2J"/>  <!-- UNITED STATES MINOR OUTLYING ISLANDS -->
+			<xs:enumeration value="X3"/>  <!-- URUGUAY -->
+			<xs:enumeration value="2K"/>  <!-- UZBEKISTAN -->
+			<xs:enumeration value="2L"/>  <!-- VANUATU -->
+			<xs:enumeration value="X5"/>  <!-- VENEZUELA -->
+			<xs:enumeration value="Q1"/>  <!-- VIET NAM -->
+			<xs:enumeration value="D8"/>  <!-- VIRGIN ISLANDS, BRITISH -->
+			<xs:enumeration value="VI"/>  <!-- VIRGIN ISLANDS, U.S. -->
+			<xs:enumeration value="X8"/>  <!-- WALLIS AND FUTUNA -->
+			<xs:enumeration value="U5"/>  <!-- WESTERN SAHARA -->
+			<xs:enumeration value="T7"/>  <!-- YEMEN -->
+			<xs:enumeration value="Y4"/>  <!-- ZAMBIA -->
+			<xs:enumeration value="Y5"/>  <!-- ZIMBABWE -->
+			<xs:enumeration value="XX"/>  <!-- UNKNOWN -->
+		</xs:restriction>
+    </xs:simpleType>
+</xs:schema>

From 3fca3c9792e40fa7e916d625f417f3e121bf3504 Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Sat, 30 Nov 2024 18:18:21 -0500
Subject: [PATCH 137/161] fix pre commit

---
 src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py
index 0fec63c..ef43757 100644
--- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py
+++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py
@@ -1 +1,3 @@
+"""Implement record linkage model between SEC companies and EIA utilities."""
+
 from . import preprocessing

From d53ab255c57df3bb46c38429d4e589189ace04f0 Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Sun, 1 Dec 2024 09:29:35 -0500
Subject: [PATCH 138/161] update python dependency in test environment

---
 test_environment.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_environment.yml b/test_environment.yml
index 5fa9b2d..c3d51d5 100644
--- a/test_environment.yml
+++ b/test_environment.yml
@@ -6,7 +6,7 @@ channels:
 dependencies:
   # Packages required for setting up the environment
   - pip>=21,<24
-  - python>=3.10,<3.12
+  - python>=3.10,<3.13  # "<=3.12" would only admit 3.12.0, not later 3.12.x
   - setuptools>=66,<69
 
   # Packages specified in setup.py that need or benefit from binary conda packages

From 2eb1555a74f976205391247d03dd3d10614760eb Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Sun, 1 Dec 2024 10:21:48 -0500
Subject: [PATCH 139/161] update github tox env

---
 .github/workflows/tox-pytest.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/tox-pytest.yml b/.github/workflows/tox-pytest.yml
index 730fd08..4fff900 100644
--- a/.github/workflows/tox-pytest.yml
+++ b/.github/workflows/tox-pytest.yml
@@ -11,7 +11,7 @@ jobs:
       id-token: write
     strategy:
       matrix:
-        python-version: ["3.10", "3.11"]
+        python-version: ["3.10", "3.11", "3.12"]
       fail-fast: false
     defaults:
       run:

From 44eb70beb63ced07af28eeba77e2676541099334 Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Mon, 2 Dec 2024 13:31:27 -0500
Subject: [PATCH 140/161] restructure input table assets

---
 .../library/record_linkage_utils.py           | 109 ++++++++++++++
 src/mozilla_sec_eia/models/sec10k/__init__.py |  11 +-
 .../models/sec_eia_record_linkage/__init__.py |  50 ++++++-
 .../sec_eia_record_linkage/preprocessing.py   | 137 +-----------------
 .../sec_eia_splink_config.py                  |  57 ++++++++
 ...te_eia_input.py => transform_eia_input.py} |  43 +++++-
 .../transform_sec_input.py}                   | 112 +++++++-------
 7 files changed, 319 insertions(+), 200 deletions(-)
 create mode 100644 src/mozilla_sec_eia/library/record_linkage_utils.py
 create mode 100644 src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_eia_splink_config.py
 rename src/mozilla_sec_eia/models/sec_eia_record_linkage/{create_eia_input.py => transform_eia_input.py} (70%)
 rename src/mozilla_sec_eia/models/{sec10k/sec_output_table.py => sec_eia_record_linkage/transform_sec_input.py} (85%)

diff --git a/src/mozilla_sec_eia/library/record_linkage_utils.py b/src/mozilla_sec_eia/library/record_linkage_utils.py
new file mode 100644
index 0000000..9a33392
--- /dev/null
+++ b/src/mozilla_sec_eia/library/record_linkage_utils.py
@@ -0,0 +1,109 @@
+"""Utility functions for cleaning strings during modeling preprocessing steps."""
+
+import jellyfish
+import pandas as pd
+
+from pudl.analysis.record_linkage import name_cleaner
+
+INVALID_NAMES = [
+    "llc",
+    "limited liability company",
+    "limited",
+    "ltd",
+    "iiii",
+    "inc",
+    "incorporated",
+    "partnership",
+    "i",
+    "name",
+    "company",
+    "&",
+    "",
+]
+
+company_name_cleaner = name_cleaner.CompanyNameCleaner(
+    cleaning_rules_list=[
+        "remove_word_the_from_the_end",
+        "remove_word_the_from_the_beginning",
+        "replace_ampersand_by_AND",
+        "replace_hyphen_by_space",
+        "replace_underscore_by_space",
+        "remove_text_punctuation",
+        "remove_parentheses",
+        "remove_brackets",
+        "remove_curly_brackets",
+        "enforce_single_space_between_words",
+    ]
+)
+
+legal_term_remover = name_cleaner.CompanyNameCleaner(
+    cleaning_rules_list=[], handle_legal_terms=2
+)
+
+
+def clean_company_name(
+    df: pd.DataFrame, col_name: str = "company_name"
+) -> pd.DataFrame:
+    """Conduct cleaning on a company name column and add column without legal terms.
+
+    Strips and lowercases `col_name` on the passed dataframe (empty strings become
+    NA), then uses the PUDL name cleaner object to do basic cleaning such as
+    stripping punctuation and normalizing spacing. The clean column becomes the
+    `col_name` column and the pre-cleaning column is renamed to `{col_name}_raw`.
+    Also adds `{col_name}_no_legal`, the clean strings with legal terms stripped.
+
+    Arguments:
+        df: The dataframe that is to be cleaned. Must contain `col_name` column.
+        col_name: The name of the column with the company name strings.
+
+    Returns:
+        pd.DataFrame: The dataframe with `col_name` now containing cleaned
+            strings, plus the `{col_name}_raw` and `{col_name}_no_legal` columns.
+            Rows whose cleaned name is an empty string are dropped.
+    """
+    df[col_name] = df[col_name].fillna(pd.NA).str.strip().str.lower().replace("", pd.NA)
+    df.loc[:, f"{col_name}_clean"] = company_name_cleaner.apply_name_cleaning(
+        df[[col_name]]
+    ).str.strip()
+    df = df[df[f"{col_name}_clean"] != ""]
+    df = df.rename(columns={col_name: f"{col_name}_raw"}).rename(
+        columns={f"{col_name}_clean": col_name}
+    )
+    df.loc[:, f"{col_name}_no_legal"] = legal_term_remover.apply_name_cleaning(
+        df[[col_name]]
+    )
+    return df
+
+
+def drop_invalid_names(
+    df: pd.DataFrame, col_name: str = "company_name"
+) -> pd.DataFrame:
+    """Keep only rows whose `col_name` value is not one of `INVALID_NAMES`."""
+    return df.loc[~df[col_name].isin(INVALID_NAMES)]
+
+
+# TODO: this is in PUDL, deduplicate
+def get_metaphone_col(col: pd.Series) -> pd.Series:
+    """Apply `jellyfish.metaphone` to each value in `col` and return the result."""
+    return col.apply(jellyfish.metaphone)
+
+
+def transform_company_name(df: pd.DataFrame) -> pd.DataFrame:
+    """Clean `company_name`, add its metaphone column, and drop invalid-name rows."""
+    df = clean_company_name(df)
+    df.loc[:, "company_name_mphone"] = get_metaphone_col(df["company_name_no_legal"])
+    df = drop_invalid_names(df, "company_name")
+    return df
+
+
+def fill_street_address_nulls(
+    df: pd.DataFrame,
+    address_col: str = "street_address",
+    secondary_address_col: str = "street_address_2",
+) -> pd.DataFrame:
+    """Fill null `address_col` values from `secondary_address_col`; return `df`."""
+    df[address_col] = df[address_col].where(
+        (~df[address_col].isnull()) | (df[secondary_address_col].isnull()),
+        df[secondary_address_col],
+    )
+    return df
diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py
index 1bf9be9..4fd2f14 100644
--- a/src/mozilla_sec_eia/models/sec10k/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/__init__.py
@@ -28,14 +28,14 @@
     mlflow_train_test_io_managers,
 )
 
-from . import basic_10k, ex_21, extract, sec_output_table
+from . import basic_10k, ex_21, extract
 from .utils.cloud import cloud_interface_resource
 
 basic_10k_assets = load_assets_from_modules([basic_10k])
 ex21_assets = load_assets_from_package_module(ex_21)
 ex21_data_assets = load_assets_from_modules([ex_21.data])
 shared_assets = load_assets_from_modules([extract])
-sec_output_assets = load_assets_from_modules([sec_output_table])
+
 
 basic_10k_production_job = model_jobs.create_production_model_job(
     "basic_10k_extraction",
@@ -57,9 +57,6 @@
     description="Run exhibit 21 extraction pipeline on archived filings.",
 )
 
-sec_output_table_production_job = model_jobs.create_production_model_job(
-    "sec_output_table_creation", sec_output_table.production_assets
-)
 
 finetune_layoutlm = define_dagstermill_asset(
     name="layoutlm",
@@ -139,8 +136,7 @@
         finetune_layoutlm,
         train_exhibit21_layout_classifier,
     ]
-    + ex21_data_assets
-    + sec_output_assets,
+    + ex21_data_assets,
     jobs=[
         basic_10k_production_job,
         basic_10k_validation_job,
@@ -148,7 +144,6 @@
         finetune_layoutlm_job,
         exhibit21_extraction_validation_job,
         exhibit21_layout_classifier_training_job,
-        sec_output_table_production_job,
     ],
     resources={
         "cloud_interface": cloud_interface_resource,
diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py
index ef43757..c87c0cb 100644
--- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py
+++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py
@@ -1,3 +1,51 @@
 """Implement record linkage model between SEC companies and EIA utilities."""
 
-from . import preprocessing
+from dagster import Definitions, load_assets_from_modules
+from dagstermill import (
+    ConfigurableLocalOutputNotebookIOManager,
+)
+from upath import UPath
+
+from mozilla_sec_eia.library import model_jobs
+from mozilla_sec_eia.library.generic_io_managers import (
+    PandasParquetIOManager,
+    PickleUPathIOManager,
+)
+from mozilla_sec_eia.library.mlflow import (
+    MlflowPyfuncModelIOManager,
+    mlflow_interface_resource,
+    mlflow_train_test_io_managers,
+)
+from mozilla_sec_eia.models.sec10k.utils.cloud import cloud_interface_resource
+
+from . import transform_eia_input, transform_sec_input
+
+eia_assets = load_assets_from_modules([transform_eia_input])
+sec_assets = load_assets_from_modules([transform_sec_input])
+
+eia_input_table_production_job = model_jobs.create_production_model_job(
+    "eia_input_table_creation", transform_eia_input.production_assets
+)
+sec_input_table_production_job = model_jobs.create_production_model_job(
+    "sec_input_table_creation", transform_sec_input.production_assets
+)
+
+defs = Definitions(
+    [*eia_assets, *sec_assets],
+    jobs=[eia_input_table_production_job, sec_input_table_production_job],
+    resources={
+        "cloud_interface": cloud_interface_resource,
+        "mlflow_interface": mlflow_interface_resource,
+        "pandas_parquet_io_manager": PandasParquetIOManager(
+            base_path=UPath("gs://sec10k-outputs/v2")
+        ),
+        "pickle_gcs_io_manager": PickleUPathIOManager(
+            base_path=UPath("gs://sec10k-outputs/dagster_storage")
+        ),
+        "pyfunc_model_io_manager": MlflowPyfuncModelIOManager(
+            mlflow_interface=mlflow_interface_resource
+        ),
+        "output_notebook_io_manager": ConfigurableLocalOutputNotebookIOManager(),
+    }
+    | mlflow_train_test_io_managers,
+)
diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py
index 12c4704..3caa182 100644
--- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py
+++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py
@@ -1,14 +1,9 @@
 """Preprocessing for EIA and SEC input data before record linkage."""
 
-import re
-from importlib import resources
-from pathlib import Path
-
 import jellyfish
 import numpy as np
 import pandas as pd
 
-from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive
 from pudl.analysis.record_linkage import name_cleaner
 
 EIA_COL_MAP = {
@@ -16,8 +11,6 @@
     "address_2": "street_address_2",
 }
 
-EX21_COL_MAP = {"subsidiary": "company_name", "loc": "loc_of_incorporation"}
-
 SEC_COL_MAP = {
     "company_conformed_name": "company_name",
     "street_1": "street_address",
@@ -85,49 +78,6 @@
 )
 
 
-# TODO: remove
-def get_sec_state_code_dict():
-    """Create a dictionary mapping state codes to their names.
-
-    Table found at https://www.sec.gov/submit-filings/filer-support-resources/edgar-state-country-codes
-    Published by SEC and reports valid state codes
-    for filers of Form D. Used to standardize the state codes
-    in the SEC 10K filings. The expanded names of the state codes
-    are comments in the XML file, so we have to read the XML in as
-    text and parse it.
-    """
-    # TODO: make a check to see if SEC has published a new version of this table
-    xml_filepath = (
-        resources.files("mozilla_sec_eia.package_data") / "formDStateCodes.xsd.xml"
-    )
-    with Path.open(xml_filepath) as file:
-        xml_text = file.read()
-
-    pattern = r'<xs:enumeration value="(.*?)"/>.*?<!--\s*(.*?)\s*-->'
-    state_code_dict = {
-        code.lower(): name.lower()
-        for code, name in re.findall(pattern, xml_text, re.DOTALL)
-    }
-    return state_code_dict
-
-
-# TODO: moved to output table module, take out
-def _add_report_year_to_sec(sec_df):
-    """Merge metadata on to get a report year for extracted SEC data.
-
-    Expects filename to be the index of the SEC dataframe.
-    """
-    archive = GCSArchive()
-    md = archive.get_metadata()
-    sec_df = sec_df.merge(
-        md[["date_filed"]], how="left", left_index=True, right_index=True
-    )
-    sec_df.loc[:, "report_year"] = (
-        sec_df["report_date"].astype("datetime64[ns]").dt.year
-    )
-    return sec_df
-
-
 # TODO: this is in PUDL, pull out into helper function
 def _get_metaphone(row, col_name):
     if pd.isnull(row[col_name]):
@@ -135,7 +85,7 @@ def _get_metaphone(row, col_name):
     return jellyfish.metaphone(row[col_name])
 
 
-# TODO: deduplicate this with what's already been done
+# TODO: delete
 def _clean_company_name(df):
     df.loc[:, "company_name_clean"] = company_name_cleaner.apply_name_cleaning(
         df[["company_name"]]
@@ -150,7 +100,7 @@ def _clean_company_name(df):
     return df
 
 
-# TODO: deduplicate this with what's already been done
+# TODO: delete
 def clean_sec_df(df):
     """Shared cleaning for SEC 10K and Ex. 21 dataframes.
 
@@ -158,12 +108,6 @@ def clean_sec_df(df):
         df: Ex. 21 or SEC 10K basic info dataframe with columns
         company_name, loc_of_incorporation, and report_year.
     """
-    df[["company_name", "loc_of_incorporation"]] = (
-        df[["company_name", "loc_of_incorporation"]]
-        .fillna(pd.NA)
-        .apply(lambda x: x.str.strip().str.lower())
-        .replace("", pd.NA)
-    )
     df = _clean_company_name(df)
     df.loc[:, "company_name_mphone"] = df.apply(
         _get_metaphone, axis=1, args=("company_name_no_legal",)
@@ -177,40 +121,13 @@ def clean_sec_df(df):
     return df
 
 
-# TODO: moved to output table module, take out
-def _remove_weird_sec_cols(sec_df):
-    weird_cols = ["]fiscal_year_end", "]irs_number", "]state_of_incorporation"]
-    for weird_col in weird_cols:
-        if weird_col not in sec_df:
-            continue
-        normal_col = weird_col[1:]
-        sec_df.loc[:, normal_col] = sec_df[normal_col].where(
-            sec_df[weird_col].isnull(), sec_df[weird_col]
-        )
-        sec_df = sec_df.drop(columns=[weird_col])
-    return sec_df
-
-
-# TODO: for now split these into separate cleaning functions
-# later unite them into one cleaning function
+# TODO: delete
 def prepare_sec10k_basic_info_df(sec_df):
     """Preprocess SEC 10k basic information dataframe for record linkage."""
-    # sec_df = _add_report_year_to_sec(sec_df)
     sec_df = sec_df.rename(columns=SEC_COL_MAP).reset_index()
-    # state_code_to_name = get_sec_state_code_dict()
-    # sec_df.loc[:, "loc_of_incorporation"] = sec_df["state_of_incorporation"].replace(
-    #     state_code_to_name
-    # )
-    # TODO: maybe shouldn't expand the state names and comparison should
-    # just be an exact match or nothing?
-    # sec_df.loc[:, "state"] = sec_df["state"].replace(state_code_to_name)
-    # TODO: needs a record_id_sec column?
-    # sec_df = sec_df.rename(columns={"record_id_sec": "record_id"})
-    # sec_df = _remove_weird_sec_cols(sec_df)
     sec_df = clean_sec_df(sec_df)
     sec_df[STR_COLS] = sec_df[STR_COLS].apply(lambda x: x.str.strip().str.lower())
-    # TODO: cluster/mark these duplicates so they can be assigned
-    # IDs post matching
+    # TODO: does this actually drop anything?
     sec_df = sec_df.drop_duplicates(
         subset=[
             "central_index_key",
@@ -223,31 +140,17 @@ def prepare_sec10k_basic_info_df(sec_df):
             "zip_code",
         ]
     )
-    sec_df.loc[:, "sec_company_id"] = sec_df["central_index_key"]
     return sec_df
 
 
+# TODO: delete
 def prepare_ex21_df(ex21_df):
     """Preprocess Ex. 21 extracted dataframe for record linkage."""
-    ex21_df = ex21_df.rename(columns=EX21_COL_MAP)
-    # TODO: move this to general preprocessing function?
-    state_code_to_name = get_sec_state_code_dict()
-    ex21_df.loc[:, "loc_of_incorporation"] = ex21_df["loc_of_incorporation"].replace(
-        state_code_to_name
-    )
-    name_to_state_code = {v: k for k, v in state_code_to_name.items()}
-    # need this?
-    ex21_df.loc[:, "state_of_incorporation"] = ex21_df["loc_of_incorporation"].replace(
-        name_to_state_code
-    )
     ex21_df = clean_sec_df(ex21_df)
-    ex21_df = ex21_df.drop_duplicates(
-        subset=["company_name", "loc_of_incorporation", "report_year"]
-    )
-    # ex21_df = ex21_df.reset_index(drop=True).reset_index(names="record_id")
     return ex21_df
 
 
+# TODO: delete
 def prepare_eia_df(eia_df):
     """Preprocess EIA utility dataframe for record linkage."""
     eia_df = eia_df.rename(columns=EIA_COL_MAP)
@@ -262,31 +165,3 @@ def prepare_eia_df(eia_df):
     )
     eia_df = eia_df.reset_index(drop=True).reset_index(names="record_id")
     return eia_df
-
-
-def add_sec_company_id_to_subsidiaries(ex21_df: pd.DataFrame):
-    """Add sec_company_id onto SEC Ex. 21 subsidiaries.
-
-    At this point, the passed in Ex. 21 dataframe should have been
-    matched to SEC 10K filers with record linkage and assigned a CIK
-    where applicable (if the subsidiary files with the SEC). Take the
-    subsidiaries that don't have a CIK and create an sec_company_id
-    for those companies.
-
-    Arguments:
-        ex21_df: A dataframe of subsidiaries from SEC Ex. 21 filings with
-        columns subsidiary_cik, company_name (of the subsidiary),
-        and loc_of_incorporation.
-    """
-    ex21_df = ex21_df.sort_values(by="parent_cik")
-    ex21_df = ex21_df.drop_duplicates(subset=["company_name", "loc_of_incorporation"])
-    ex21_df.loc[:, "sec_company_id"] = (
-        ex21_df["parent_cik"]
-        + "_"
-        + (ex21_df.groupby("parent_cik").cumcount() + 1).astype(str)
-    )
-    # override sec_company_id with CIK where a subsidiary has an assigned CIK
-    ex21_df.loc[:, "sec_company_id"] = ex21_df["sec_company_id"].where(
-        ex21_df["subsidiary_cik"].isnull(), ex21_df["subsidiary_cik"]
-    )
-    return ex21_df
diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_eia_splink_config.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_eia_splink_config.py
new file mode 100644
index 0000000..3a5edae
--- /dev/null
+++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_eia_splink_config.py
@@ -0,0 +1,57 @@
+"""Configuration file for the splink SEC to EIA record linkage model."""
+
+import splink.comparison_library as cl
+from splink import block_on
+
+STR_COLS = [
+    "company_name",
+    "street_address",
+    "street_address_2",
+    "city",
+    "state",
+    "zip_code",
+]
+
+SHARED_COLS = [
+    "record_id",
+    "report_date",
+    "report_year",
+    "company_name",
+    "company_name_no_legal",
+    "company_name_mphone",
+    "street_address",
+    "street_address_2",
+    "city",
+    "state",  # could use state of incorporation from SEC
+    "zip_code",
+    "phone_number",
+]
+
+MATCH_COLS = ["company_name", "state", "city", "street_address"]
+
+BLOCKING_RULES = [
+    "substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4)",
+    "l.street_address = r.street_address",
+    "substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3) and l.city = r.city",
+    "substr(l.company_name_mphone,1,2) = substr(r.company_name_mphone,1,2) and array_length(list_intersect(l.street_address_list, r.street_address_list)) >= 2",
+]
+
+company_name_comparison = cl.NameComparison(
+    "company_name_no_legal",
+    jaro_winkler_thresholds=[0.95],
+)
+
+address_comparison = cl.LevenshteinAtThresholds(
+    "street_address", distance_threshold_or_thresholds=[1]
+).configure(term_frequency_adjustments=True)
+print(address_comparison.get_comparison("duckdb").human_readable_description)
+
+state_comparison = cl.ExactMatch("state").configure(term_frequency_adjustments=True)
+city_comparison = cl.NameComparison("city", jaro_winkler_thresholds=[0.9])
+
+# blocking rules for estimating probability two random records match
+deterministic_blocking_rules = [
+    block_on("company_name_mphone", "company_name_mphone"),
+    "jaro_winkler_similarity(r.company_name, l.company_name) >= .95 and l.city = r.city",
+    "substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4) and l.city = r.city and l.street_address = r.street_address",
+]
diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/create_eia_input.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py
similarity index 70%
rename from src/mozilla_sec_eia/models/sec_eia_record_linkage/create_eia_input.py
rename to src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py
index d0266b9..4da5c1b 100644
--- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/create_eia_input.py
+++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py
@@ -1,10 +1,23 @@
 """Create an EIA input utilities table that's ready for record linkage with the SEC 10K companies."""
 
+import numpy as np
 import pandas as pd
+from dagster import AssetOut, multi_asset
+
+from mozilla_sec_eia.library.record_linkage_utils import (
+    fill_street_address_nulls,
+    transform_company_name,
+)
+from mozilla_sec_eia.models.sec_eia_record_linkage.sec_eia_splink_config import STR_COLS
+
+EIA_COL_MAP = {
+    "utility_name_eia": "company_name",  # TODO: should be linking to owner or operator name?
+    "address_2": "street_address_2",
+}
 
 
 # TODO: make Dagster inputs instead of reading from AWS?
-def get_eia861_utilities_table():
+def harvest_eia861_utilities():
     """Get the utilities contained in EIA Form 861.
 
     TODO: In PUDL we should eventually implement an actual thorough
@@ -59,18 +72,36 @@ def get_eia861_utilities_table():
     return eia861_df
 
 
+@multi_asset(
+    outs={
+        "core_eia__parents_and_subsidiaries": AssetOut(
+            io_manager_key="pandas_parquet_io_manager"
+        )
+        # TODO: allow year partitions?
+    }
+)
 # TODO: add Dagster asset inputs for PUDL inputs instead of reading from AWS?
-def get_eia_utilities_table():
+def eia_rl_input_table():
     """Create a table of EIA Form 860 and 861 utilities."""
     raw_eia_df = pd.read_parquet(
         "s3://pudl.catalyst.coop/stable/out_eia__yearly_utilities.parquet"
     )
-    eia861_df = get_eia861_utilities_table()
+    eia861_df = harvest_eia861_utilities()
     eia_df = pd.concat([raw_eia_df, eia861_df])
     eia_df = eia_df.drop_duplicates(
         subset=["utility_id_eia", "report_date"], keep="first"
-    )
+    ).dropna(subset="utility_name_eia")
+    eia_df = eia_df.rename(columns=EIA_COL_MAP)
     eia_df["report_date"] = eia_df["report_date"].astype("datetime64[ns]")
-    # there are nulls from non harvested 861 utilities
-    eia_df = eia_df.dropna(subset="utility_name_eia")
+    eia_df.loc[:, "report_year"] = eia_df["report_date"].dt.year
+    eia_df = transform_company_name(eia_df)
+    eia_df.loc[:, "zip_code"] = eia_df["zip_code"].str[:5]
+    eia_df = fill_street_address_nulls(eia_df)
+    eia_df[STR_COLS] = eia_df[STR_COLS].apply(lambda x: x.str.strip().str.lower())
+    eia_df = eia_df.fillna(np.nan)
+    eia_df = eia_df.reset_index(drop=True).reset_index(names="record_id")
+
     return eia_df
+
+
+production_assets = [eia_rl_input_table]
diff --git a/src/mozilla_sec_eia/models/sec10k/sec_output_table.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py
similarity index 85%
rename from src/mozilla_sec_eia/models/sec10k/sec_output_table.py
rename to src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py
index 1ccaea9..7c4aab0 100644
--- a/src/mozilla_sec_eia/models/sec10k/sec_output_table.py
+++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py
@@ -9,20 +9,32 @@
 import pandas as pd
 from dagster import AssetIn, AssetOut, multi_asset
 
+from mozilla_sec_eia.library.record_linkage_utils import (
+    fill_street_address_nulls,
+    transform_company_name,
+)
 from mozilla_sec_eia.models.sec10k.utils.cloud import (
     convert_ex21_id_to_filename,
 )
-from mozilla_sec_eia.models.sec_eia_record_linkage.preprocessing import (
-    company_name_cleaner,
-)
+from mozilla_sec_eia.models.sec_eia_record_linkage.sec_eia_splink_config import STR_COLS
 
-from .extract import (
+from ..sec10k.extract import (
     sec10k_filing_metadata,
     year_quarter_partitions,
 )
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
+
+EX21_COL_MAP = {"subsidiary": "company_name", "loc": "loc_of_incorporation"}
+SEC_COL_MAP = {
+    "company_conformed_name": "company_name",
+    "street_1": "street_address",
+    "street_2": "street_address_2",
+    "zip": "zip_code",
+    "business_phone": "phone_number",
+}
+
 INVALID_NAMES = [
     "llc",
     "limited liability company",
@@ -130,29 +142,7 @@ def clean_location_of_inc(df) -> pd.DataFrame:
     return df
 
 
-def clean_company_name(df) -> pd.DataFrame:
-    """Clean company name column in SEC basic 10K or Ex. 21 dataframe.
-
-    Arguments:
-        df: Ex. 21 or SEC 10K basic info dataframe with company_name
-            column.
-    """
-    df["company_name"] = (
-        df["company_name"].fillna(pd.NA).str.strip().str.lower().replace("", pd.NA)
-    )
-    df.loc[:, "company_name_clean"] = company_name_cleaner.apply_name_cleaning(
-        df[["company_name"]]
-    ).str.strip()
-    df = df[
-        (~df["company_name"].isin(INVALID_NAMES))
-        & (~df["company_name_clean"].isin(INVALID_NAMES))
-    ]
-    df = df.fillna(np.nan)
-
-    return df
-
-
-def add_parent_company_cik(ex21_df: pd.DataFrame, md: pd.DataFrame) -> pd.DataFrame:
+def _add_parent_company_cik(ex21_df: pd.DataFrame, md: pd.DataFrame) -> pd.DataFrame:
     """Add the CIK of the parent company to Ex. 21 subsidiaries."""
     ex21_df = ex21_df.merge(md[["filename", "cik"]], how="left", on="filename").rename(
         columns={"cik": "parent_company_cik"}
@@ -276,48 +266,74 @@ def create_sec_company_id_for_ex21_subs(ex21_df: pd.DataFrame) -> pd.DataFrame:
         "ex21_df": AssetIn("ex21_company_ownership_info"),
     },
     outs={
-        "clean_ex21_subsidiary_table": AssetOut(
+        "transformed_ex21_subsidiary_table": AssetOut(
             io_manager_key="pandas_parquet_io_manager",
         )
     },
     partitions_def=year_quarter_partitions,
 )
-def clean_ex21_table(
+def transform_ex21_table(
     ex21_df: pd.DataFrame, sec10k_filing_metadata: pd.DataFrame
 ) -> pd.DataFrame:
-    """Clean Ex. 21 table of subsidiaries before combining with basic 10k table."""
+    """Transform Ex. 21 table of subsidiaries before combining with basic 10k table."""
     ex21_df.loc[:, "filename"] = convert_ex21_id_to_filename(ex21_df)
     ex21_df = ex21_df.drop(columns=["id"])
     ex21_df = _add_report_year_to_sec(ex21_df, sec10k_filing_metadata)
-    ex21_df = ex21_df.rename(
-        columns={"subsidiary": "company_name", "loc": "location_of_inc"}
-    )
+    ex21_df = ex21_df.rename(columns=EX21_COL_MAP)
     ex21_df = clean_location_of_inc(ex21_df)
-    ex21_df = clean_company_name(ex21_df)
-    ex21_df = add_parent_company_cik(ex21_df, sec10k_filing_metadata)
+    # TODO: what to do with the clean company name?
+    ex21_df = transform_company_name(ex21_df)
+    ex21_df = _add_parent_company_cik(ex21_df, sec10k_filing_metadata)
     # add an sec_company_id, ultimately this ID become the subsidiary's CIK
     # if the subsidiary is matched to an SEC filer
     ex21_df = create_sec_company_id_for_ex21_subs(ex21_df=ex21_df)
     ex21_df = _flatten_sec_companies_across_time(ex21_df)
+    ex21_df = ex21_df.fillna(np.nan)
 
     return ex21_df
 
 
+def transform_basic10k_table(
+    basic_10k_df: pd.DataFrame, sec10k_filing_metadata: pd.DataFrame
+) -> pd.DataFrame:
+    """Pivot the basic 10-K key/value table wide and clean it for record linkage."""
+    basic_10k_df = basic_10k_df.reset_index().pivot_table(
+        values="value", index="filename", columns="key", aggfunc="first"
+    )
+    basic_10k_df.columns.name = None
+    # TODO: chain these function calls together
+    basic_10k_df = basic_10k_df.reset_index()
+    basic_10k_df = _remove_weird_sec_cols(basic_10k_df)
+    basic_10k_df = _add_report_year_to_sec(basic_10k_df, sec10k_filing_metadata)
+    basic_10k_df = basic_10k_df.rename(columns=SEC_COL_MAP)
+    # add a location of incorporation to better match it to Ex. 21 subsidiaries
+    basic_10k_df = clean_location_of_inc(basic_10k_df)
+    basic_10k_df = transform_company_name(basic_10k_df)
+    basic_10k_df.loc[:, "zip_code"] = basic_10k_df["zip_code"].str[:5]
+    basic_10k_df = fill_street_address_nulls(basic_10k_df)
+    basic_10k_df.loc[:, "files_10k"] = True
+    basic_10k_df.loc[:, "sec_company_id"] = basic_10k_df["central_index_key"]
+    basic_10k_df[STR_COLS] = basic_10k_df[STR_COLS].apply(
+        lambda x: x.str.strip().str.lower()
+    )
+    return basic_10k_df
+
+
 @multi_asset(
     ins={
         "basic_10k_df": AssetIn("basic_10k_company_info"),
-        "clean_ex21_df": AssetIn("clean_ex21_subsidiary_table"),
+        "clean_ex21_df": AssetIn("transformed_ex21_subsidiary_table"),
         # specify an io_manager_key?
     },
     outs={
-        "out_sec_10k__parents_and_subsidiaries": AssetOut(
+        "core_sec_10k__parents_and_subsidiaries": AssetOut(
             io_manager_key="pandas_parquet_io_manager",
             # specify a dagster_type?
         ),
     },
     partitions_def=year_quarter_partitions,
 )
-def sec_output_table(
+def sec_rl_input_table(
     basic_10k_df: pd.DataFrame,
     clean_ex21_df: pd.DataFrame,
     sec10k_filing_metadata: pd.DataFrame,
@@ -329,19 +345,7 @@ def sec_output_table(
     filing companies. Create an sec_company_id for subsidiaries that aren't linked
     to a CIK.
     """
-    basic_10k_df = basic_10k_df.reset_index().pivot_table(
-        values="value", index="filename", columns="key", aggfunc="first"
-    )
-    basic_10k_df.columns.name = None
-    basic_10k_df = basic_10k_df.reset_index()
-    basic_10k_df = _remove_weird_sec_cols(basic_10k_df)
-    basic_10k_df = _add_report_year_to_sec(basic_10k_df, sec10k_filing_metadata)
-    # add a location of incorporation to better match it to Ex. 21 subsidiaries
-    basic_10k_df = clean_location_of_inc(basic_10k_df)
-    basic_10k_df = basic_10k_df.rename(
-        columns={"company_conformed_name": "company_name"}
-    )
-    basic_10k_df = clean_company_name(basic_10k_df)
+    basic_10k_df = transform_basic10k_table(basic_10k_df, sec10k_filing_metadata)
     ex21_df_with_cik = match_ex21_subsidiaries_to_filer_company(
         basic10k_df=basic_10k_df, ex21_df=clean_ex21_df
     )
@@ -350,18 +354,18 @@ def sec_output_table(
         how="left",
         on="central_index_key",
     )
-    basic_10k_df.loc[:, "files_10k"] = True
-    basic_10k_df.loc[:, "sec_company_id"] = basic_10k_df["central_index_key"]
     # get the subsidiary companies that weren't matched to a 10K filing company
     ex21_non_filing_subs_df = ex21_df_with_cik[
         ex21_df_with_cik["central_index_key"].isnull()
     ]
     ex21_non_filing_subs_df.loc[:, "files_10k"] = False
     out_df = pd.concat([basic_10k_df, ex21_non_filing_subs_df])
+    out_df = out_df.fillna(np.nan)
     # this drops records for earlier company names and addresses
     # that have since changed, so we lose some information
     out_df = _flatten_sec_companies_across_time(out_df)
+
     return out_df
 
 
-production_assets = [sec_output_table, sec10k_filing_metadata]
+production_assets = [sec_rl_input_table, sec10k_filing_metadata]

From 7563df8e57bcbf34296c4bcde674d63ae145426f Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Mon, 2 Dec 2024 13:37:20 -0500
Subject: [PATCH 141/161] include pseudo code of SEC output table module

---
 .../models/sec_eia_record_linkage/sec_output_table.py  | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_output_table.py

diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_output_table.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_output_table.py
new file mode 100644
index 0000000..7f974ad
--- /dev/null
+++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_output_table.py
@@ -0,0 +1,10 @@
+"""Module for creating the SEC company output table which connects to EIA company data."""
+
+
+# the input to this method is "core_sec_10k__parents_and_subsidiaries"
+def sec_output_table():
+    """Connect SEC to EIA and format an output table (placeholder, not implemented)."""
+    # TODO: run record linkage to connect SEC to EIA?
+    # TODO: add a utility_id_eia column onto the core table
+    # TODO: drop company_name_no_legal, company_name_mphone, other intermediate columns
+    pass

From 3c88ff2f9d0a2578e5bc337aef5075e8575e4952 Mon Sep 17 00:00:00 2001
From: Dazhong Xia <dazhong.xia@catalyst.coop>
Date: Tue, 3 Dec 2024 11:28:37 -0500
Subject: [PATCH 142/161] Try using conda env to run tox

---
 .github/workflows/tox-pytest.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/tox-pytest.yml b/.github/workflows/tox-pytest.yml
index 4fff900..f03718e 100644
--- a/.github/workflows/tox-pytest.yml
+++ b/.github/workflows/tox-pytest.yml
@@ -11,7 +11,7 @@ jobs:
       id-token: write
     strategy:
       matrix:
-        python-version: ["3.10", "3.11", "3.12"]
+        python-version: ["3.11", "3.12"]
       fail-fast: false
     defaults:
       run:
@@ -70,7 +70,7 @@ jobs:
 
       - name: Run PyTest with Tox
         run: |
-          tox
+          conda run -n mozilla-sec-eia tox
 
       - name: Upload test coverage report to CodeCov
         uses: codecov/codecov-action@v5

From fb3d772ba21b50e78fa98abe3d922887bc9d1509 Mon Sep 17 00:00:00 2001
From: Dazhong Xia <dazhong.xia@catalyst.coop>
Date: Tue, 3 Dec 2024 11:37:11 -0500
Subject: [PATCH 143/161] Add PUDL dependency and restrict to Py3.12

---
 .github/workflows/tox-pytest.yml | 2 +-
 pyproject.toml                   | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/tox-pytest.yml b/.github/workflows/tox-pytest.yml
index f03718e..cf278df 100644
--- a/.github/workflows/tox-pytest.yml
+++ b/.github/workflows/tox-pytest.yml
@@ -11,7 +11,7 @@ jobs:
       id-token: write
     strategy:
       matrix:
-        python-version: ["3.11", "3.12"]
+        python-version: ["3.12"]
       fail-fast: false
     defaults:
       run:
diff --git a/pyproject.toml b/pyproject.toml
index a87becb..72536e5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,6 +19,7 @@ dynamic = ["version"]
 license = {file = "LICENSE.txt"}
 dependencies = [
     "accelerate>=0.21.0,<2.0", # Hugging Face dependency for PyTorch models
+    "catalystcoop.pudl @ git+https://github.com/catalyst-cooperative/pudl.git",
     "cloud-sql-python-connector[pg8000]",
     "dagster>=1.7.15", # 1.7.13 & 1.7.14 were both breaking things
     "dagster-mlflow",

From 390770fb58760d1c37ac8ffdd614498401524f77 Mon Sep 17 00:00:00 2001
From: Dazhong Xia <dazhong.xia@catalyst.coop>
Date: Tue, 3 Dec 2024 11:40:30 -0500
Subject: [PATCH 144/161] Guess you don't need to specify tox env with
 setup-micromamba

---
 .github/workflows/tox-pytest.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/tox-pytest.yml b/.github/workflows/tox-pytest.yml
index cf278df..ac346f4 100644
--- a/.github/workflows/tox-pytest.yml
+++ b/.github/workflows/tox-pytest.yml
@@ -70,7 +70,7 @@ jobs:
 
       - name: Run PyTest with Tox
         run: |
-          conda run -n mozilla-sec-eia tox
+          tox
 
       - name: Upload test coverage report to CodeCov
         uses: codecov/codecov-action@v5

From 8e8beeeee11071344ef766dd9153a985e5387965 Mon Sep 17 00:00:00 2001
From: Dazhong Xia <dazhong.xia@catalyst.coop>
Date: Tue, 3 Dec 2024 11:45:33 -0500
Subject: [PATCH 145/161] Install GDAL version via conda since we rely on PUDL
 now

---
 test_environment.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test_environment.yml b/test_environment.yml
index c3d51d5..f54968d 100644
--- a/test_environment.yml
+++ b/test_environment.yml
@@ -29,6 +29,10 @@ dependencies:
   - pytorch>=2.2,<3
   - torchvision
 
+  # GDAL is a transitive dependency whose binaries must match those installed by the
+  # pudl-dev conda environment, so we also install it with conda here.
+  - gdal==3.9.3 # pinned to ensure it matches pudl-dev environment exactly.
+
   # Use pip to install the package defined by this repo for development:
   - pip:
       - --editable ./[dev,docs,tests,types]

From 50e7b7edcd9f8612fde43fc7576d8e72e6e82dc5 Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Tue, 3 Dec 2024 15:02:04 -0800
Subject: [PATCH 146/161] notebook has cells for SEC and EIA hook up

---
 notebooks/18-kl-splink-sec-eia.ipynb | 1267 ++++++++------------------
 1 file changed, 378 insertions(+), 889 deletions(-)

diff --git a/notebooks/18-kl-splink-sec-eia.ipynb b/notebooks/18-kl-splink-sec-eia.ipynb
index 19ab082..2fdeb79 100644
--- a/notebooks/18-kl-splink-sec-eia.ipynb
+++ b/notebooks/18-kl-splink-sec-eia.ipynb
@@ -40,10 +40,7 @@
    "id": "9b8224d4-7596-45b7-bfb5-028f29a96f3d",
    "metadata": {},
    "source": [
-    "# Inputs\n",
-    "\n",
-    "Questions:\n",
-    "* What's the best way to dagsterize this to get EIA data from PUDL?"
+    "# Inputs"
    ]
   },
   {
@@ -55,140 +52,157 @@
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "4ab5594d-7d1f-425d-80e1-92c30be73011",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "raw_eia_df = pd.read_parquet(\"s3://pudl.catalyst.coop/stable/out_eia__yearly_utilities.parquet\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "2edc29d4-6c85-4b31-aae6-0de38c846e44",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "mergers_df = pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_mergers.parquet\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "eaa37762-9f94-4927-9341-0ab09be3c8ab",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "raw_eia861_df = pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__assn_utility.parquet\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "3fb7895f-10c5-4450-96f9-77b36471b53e",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "eia_df = raw_eia_df.copy()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "06c76b82-1aad-47b2-aecc-6225a286cc40",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
+   "cell_type": "markdown",
+   "id": "13d543e7-334c-4606-849b-c8d60ad668d2",
+   "metadata": {},
    "source": [
-    "harvested_df = pd.concat([\n",
-    "    pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_utility_data_misc.parquet\")[[\"report_date\", \"utility_id_eia\", \"utility_name_eia\"]],\n",
-    "    pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_operational_data_misc.parquet\")[[\"report_date\", \"utility_id_eia\", \"utility_name_eia\"]],\n",
-    "    pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_demand_side_management_misc.parquet\")[[\"report_date\", \"utility_id_eia\", \"utility_name_eia\"]],\n",
-    "    pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_energy_efficiency.parquet\")[[\"report_date\", \"utility_id_eia\", \"utility_name_eia\"]],\n",
-    "])"
+    "TODO: materialize asset and read in from Dagster GCS storage"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
-   "id": "d95acde9-1640-4c26-a5d1-c50b6666ccf4",
-   "metadata": {
-    "tags": []
-   },
+   "execution_count": 13,
+   "id": "7f3e5fdd-2c16-4dc0-8ad1-cf4516fbee33",
+   "metadata": {},
    "outputs": [],
    "source": [
-    "eia861_df = raw_eia861_df.merge(harvested_df, on=[\"report_date\", \"utility_id_eia\"], how=\"left\").drop_duplicates(subset=[\"report_date\", \"utility_id_eia\"])"
+    "from mozilla_sec_eia.models.sec_eia_record_linkage.create_eia_input import get_eia_utilities_table"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
-   "id": "3b7484de-bbc7-47ba-b408-a1af1183018c",
-   "metadata": {
-    "tags": []
-   },
+   "execution_count": 14,
+   "id": "70ebf6dc-ed00-4f78-bbaf-2805860a1b63",
+   "metadata": {},
    "outputs": [],
    "source": [
-    "mergers_df = mergers_df[mergers_df[\"new_parent\"].notna()]\n",
-    "eia861_df = eia861_df.merge(mergers_df[[\"report_date\", \"new_parent\", \"merge_address\", \"merge_city\", \"merge_state\"]], \n",
-    "                how=\"left\", \n",
-    "                left_on=[\"report_date\", \"utility_name_eia\"],\n",
-    "                right_on=[\"report_date\", \"new_parent\"]\n",
-    "               )\n",
-    "eia861_df = eia861_df.rename(columns={\"merge_address\": \"street_address\", \"merge_city\": \"city\"})\n",
-    "eia861_df = eia861_df.groupby([\"report_date\", \"utility_id_eia\"]).first().reset_index()"
+    "eia_df = get_eia_utilities_table()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
-   "id": "d3d39fc0-130f-4bbd-9cc9-bbaf58808109",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
+   "execution_count": 28,
+   "id": "9547a0ca-39f7-46c3-9a02-dcb08b75181a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>utility_id_eia</th>\n",
+       "      <th>utility_id_pudl</th>\n",
+       "      <th>utility_name_eia</th>\n",
+       "      <th>report_date</th>\n",
+       "      <th>street_address</th>\n",
+       "      <th>city</th>\n",
+       "      <th>state</th>\n",
+       "      <th>zip_code</th>\n",
+       "      <th>plants_reported_owner</th>\n",
+       "      <th>plants_reported_operator</th>\n",
+       "      <th>plants_reported_asset_manager</th>\n",
+       "      <th>plants_reported_other_relationship</th>\n",
+       "      <th>entity_type</th>\n",
+       "      <th>attention_line</th>\n",
+       "      <th>address_2</th>\n",
+       "      <th>zip_code_4</th>\n",
+       "      <th>contact_firstname</th>\n",
+       "      <th>contact_lastname</th>\n",
+       "      <th>contact_title</th>\n",
+       "      <th>phone_number</th>\n",
+       "      <th>phone_extension</th>\n",
+       "      <th>contact_firstname_2</th>\n",
+       "      <th>contact_lastname_2</th>\n",
+       "      <th>contact_title_2</th>\n",
+       "      <th>phone_number_2</th>\n",
+       "      <th>phone_extension_2</th>\n",
+       "      <th>data_maturity</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>66550</td>\n",
+       "      <td>16573.0</td>\n",
+       "      <td>Telyon AMZ Windsor LLC</td>\n",
+       "      <td>2024-01-01</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>monthly_update</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   utility_id_eia  utility_id_pudl        utility_name_eia report_date street_address  city state zip_code plants_reported_owner plants_reported_operator plants_reported_asset_manager plants_reported_other_relationship entity_type attention_line address_2 zip_code_4 contact_firstname contact_lastname contact_title phone_number phone_extension contact_firstname_2 contact_lastname_2 contact_title_2 phone_number_2 phone_extension_2   data_maturity\n",
+       "0           66550          16573.0  Telyon AMZ Windsor LLC  2024-01-01           None  None  None     None                  None                     None                          None                               None        None           None      None       None              None             None          None         None            None                None               None            None           None              None  monthly_update"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "eia861_df[\"state\"] = eia861_df[\"state\"].where(eia861_df[\"merge_state\"].isnull(), eia861_df[\"merge_state\"])\n",
-    "eia861_df = eia861_df.drop(columns=[\"new_parent\", \"merge_state\"])"
+    "eia_df.head(1)"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 11,
-   "id": "04b6b682-91f4-49e2-9f74-2861548d1dd4",
+   "cell_type": "markdown",
+   "id": "ae7706d6-3f32-4c99-aeb6-dac6c6529eec",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "eia_df = pd.concat([eia_df, eia861_df])\n",
-    "eia_df = eia_df.drop_duplicates(subset=[\"utility_id_eia\", \"report_date\"], keep=\"first\")\n",
-    "# not sure at what point this stops being a datetime\n",
-    "eia_df[\"report_date\"] = eia_df[\"report_date\"].astype(\"datetime64[ns]\")\n",
-    "# there are nulls from non harvested 861 utilities\n",
-    "eia_df = eia_df.dropna(subset=\"utility_name_eia\")"
+    "### SEC 10K Basic Info"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "ae7706d6-3f32-4c99-aeb6-dac6c6529eec",
+   "id": "012db270-d944-464c-9d30-c5995ab491a4",
    "metadata": {},
    "source": [
-    "### SEC 10K Basic Info"
+    "TODO: read in asset from Dagster GCS storage"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 24,
    "id": "d4e950a6-ee6c-414c-b5b9-52a4175bf0b7",
    "metadata": {},
    "outputs": [],
@@ -198,7 +212,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 25,
    "id": "14eb7f24-7f7b-43aa-a0df-85e888e43821",
    "metadata": {},
    "outputs": [],
@@ -211,7 +225,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 26,
    "id": "1be3364e-9887-42b2-b303-0a24e8681acf",
    "metadata": {
     "tags": []
@@ -222,6 +236,127 @@
     "raw_sec_df.columns.name = None"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "5fcb05e5-6a57-439f-802f-527242f8f223",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>]fiscal_year_end</th>\n",
+       "      <th>]irs_number</th>\n",
+       "      <th>]state_of_incorporation</th>\n",
+       "      <th>business_phone</th>\n",
+       "      <th>central_index_key</th>\n",
+       "      <th>city</th>\n",
+       "      <th>company_conformed_name</th>\n",
+       "      <th>date_of_name_change</th>\n",
+       "      <th>film_number</th>\n",
+       "      <th>fiscal_year_end</th>\n",
+       "      <th>form_type</th>\n",
+       "      <th>former_conformed_name</th>\n",
+       "      <th>irs_number</th>\n",
+       "      <th>organization_name</th>\n",
+       "      <th>sec_act</th>\n",
+       "      <th>sec_file_number</th>\n",
+       "      <th>standard_industrial_classification</th>\n",
+       "      <th>state</th>\n",
+       "      <th>state_of_incorporation</th>\n",
+       "      <th>street_1</th>\n",
+       "      <th>street_2</th>\n",
+       "      <th>zip</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>filename</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>edgar/data/1000015/0000912057-00-014793.txt</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>2039736700</td>\n",
+       "      <td>0001000015</td>\n",
+       "      <td>stamford</td>\n",
+       "      <td>meta group inc</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>585471</td>\n",
+       "      <td>1231</td>\n",
+       "      <td>10-k</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>060971675</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>000-27280</td>\n",
+       "      <td>services-engineering, accounting, research, ma...</td>\n",
+       "      <td>ct</td>\n",
+       "      <td>de</td>\n",
+       "      <td>208 harbor dr</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>06912-0061</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                            ]fiscal_year_end ]irs_number ]state_of_incorporation business_phone central_index_key      city company_conformed_name date_of_name_change film_number fiscal_year_end form_type former_conformed_name irs_number organization_name sec_act sec_file_number                 standard_industrial_classification state state_of_incorporation       street_1 street_2         zip\n",
+       "filename                                                                                                                                                                                                                                                                                                                                                                                                                   \n",
+       "edgar/data/1000015/0000912057-00-014793.txt              NaN         NaN                     NaN     2039736700        0001000015  stamford         meta group inc                 NaN      585471            1231      10-k                   NaN  060971675               NaN     NaN       000-27280  services-engineering, accounting, research, ma...    ct                     de  208 harbor dr      NaN  06912-0061"
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "raw_sec_df.head(1)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 15,
@@ -232,6 +367,16 @@
     "sec_clean_df = prepare_sec10k_basic_info_df(raw_sec_df)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "329e5d07-4eb4-4ba2-968e-aabf9be4937b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sec_asset_df = pd.read_parquet(UPath(\"gs://sec10k-outputs/v2/out_sec_10k__parents_and_subsidiaries/2023q1.parquet\"))"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "3bac9280-1183-4aba-b78f-84bcf37ef1e2",
@@ -240,6 +385,14 @@
     "### Ex. 21"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "ae57370a-36bb-40cf-b9f1-8ffdf373fa22",
+   "metadata": {},
+   "source": [
+    "TODO: get rid of this section"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 17,
@@ -277,6 +430,14 @@
     "# Preprocess Ex. 21"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "917c79d4-9250-46a7-855a-14e526bbce6c",
+   "metadata": {},
+   "source": [
+    "TODO: get rid of this section"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 19,
@@ -527,32 +688,6 @@
     "unmatched_ex21_df = ex21_with_cik[ex21_with_cik.subsidiary_cik.isnull()]"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 53,
-   "id": "56f41505-421e-4bf7-bfc4-93500e0c5e71",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "0    a_1\n",
-       "1    b_2\n",
-       "2    c_3\n",
-       "dtype: object"
-      ]
-     },
-     "execution_count": 53,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df1 = pd.DataFrame({\"text1\": [\"a\", \"b\", \"c\"]})\n",
-    "df2 = pd.DataFrame({\"text2\": [\"1\", \"2\", \"3\"]})\n",
-    "df1[\"text1\"] + \"_\" + df2[\"text2\"]"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "381e4a8f-d8da-4802-b86e-1ac4fc3094db",
@@ -565,6 +700,22 @@
     "the SEC basic info to EIA match is done? And if there's a conflicting SEC match (one basic info and one Ex. 21) then review it manually?"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "dd3b1335-6ffc-4c8d-b45e-5bee9f3f48da",
+   "metadata": {},
+   "source": [
+    "TODO: get rid of these cells"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "aaf6c9f9-6fe6-4259-bbc4-d8a18e55984c",
+   "metadata": {},
+   "source": [
+    "TODO: filter for only \"files_10k\" filers"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 67,
@@ -706,6 +857,14 @@
     "sec_clean_df.loc[:, \"street_address_list\"] = sec_clean_df[\"street_address\"].str.split()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "9f7bebc3-8e79-48e9-9178-68c112bb8ee9",
+   "metadata": {},
+   "source": [
+    "TODO: import from config file"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 36,
@@ -731,14 +890,6 @@
     "]"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "21b697b0-7d9e-452c-9b8b-ee40fd6bb7bd",
-   "metadata": {},
-   "source": [
-    "create list column for address information as well?"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 55,
@@ -759,6 +910,14 @@
     "sec_match_df = sec_clean_df[SHARED_COLS]"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "13bda908-2007-4bca-86ad-1bcf74b1b1ef",
+   "metadata": {},
+   "source": [
+    "TODO: import from config"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 43,
@@ -789,770 +948,79 @@
     }
    ],
    "source": [
-    "# duplicates exist because of differing report years\n",
-    "eia_match_df.duplicated(subset=match_cols).value_counts()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 52,
-   "id": "b53e6244-f0ca-4256-bc09-9c3264675389",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "True     168445\n",
-       "False     64515\n",
-       "Name: count, dtype: int64"
-      ]
-     },
-     "execution_count": 52,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "sec_match_df.duplicated(subset=match_cols).value_counts()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 56,
-   "id": "baa742ae-1b49-4d0a-84c8-5f864398c8ed",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sec_match_df = sec_match_df.sort_values(by=\"report_year\", ascending=False).drop_duplicates(subset=match_cols, keep=\"first\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 57,
-   "id": "63e47f5f-e142-48fa-9ffa-e14d27ee1476",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "eia_match_df = eia_match_df.sort_values(by=\"report_year\", ascending=False).drop_duplicates(subset=match_cols, keep=\"first\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 39,
-   "id": "5cf7ca17-b42b-40c6-b6f7-9077acdb1220",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "standard_industrial_classification\n",
-       "asset-backed securities [6189]          20311\n",
-       "pharmaceutical preparations [2834]       8530\n",
-       "state commercial banks [6022]            7886\n",
-       "real estate investment trusts [6798]     7706\n",
-       "services-prepackaged software [7372]     6007\n",
-       "Name: count, dtype: int64"
-      ]
-     },
-     "execution_count": 39,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# could try to use keywords like gas, electricity, utility etc.\n",
-    "sec_clean_df[\"standard_industrial_classification\"].value_counts().head(5)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 165,
-   "id": "c1500344-ff7f-450e-90dd-1105d8e7c637",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# run the Ex.21 to SEC model\n",
-    "filepath = Path(\"../sec_ex21_model_settings/2023_model.json\")\n",
-    "with open(filepath, 'r') as file:\n",
-    "    sec_ex21_settings = json.load(file)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 192,
-   "id": "172ea84f-a0b7-4e9c-b746-322a47663171",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sec_test_df = sec_match_df[sec_match_df.report_year.isin([2016, 2017])][[\"record_id\", \"report_year\", \"company_name\", \"loc_of_incorporation\", \"company_name_mphone\"]]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 193,
-   "id": "3f8ba4ee-b1e7-4e05-982e-43d8e446eea9",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "ex21_test_df = ex21_match_df[ex21_match_df.report_year.isin([2016, 2017])][[\"record_id\", \"report_year\", \"company_name\", \"loc_of_incorporation\", \"company_name_mphone\"]]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 194,
-   "id": "2c715d7a-3d6d-4970-8ae3-5a6e1a12e937",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "14125"
-      ]
-     },
-     "execution_count": 194,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "len(sec_test_df)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 195,
-   "id": "ec13db12-3664-4e00-aa83-7c372039b230",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "233101"
-      ]
-     },
-     "execution_count": 195,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "len(ex21_test_df)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 196,
-   "id": "d2fcc1da-4435-4b17-8be7-cb34a6917522",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>record_id</th>\n",
-       "      <th>report_year</th>\n",
-       "      <th>company_name</th>\n",
-       "      <th>loc_of_incorporation</th>\n",
-       "      <th>company_name_mphone</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>23</th>\n",
-       "      <td>23</td>\n",
-       "      <td>2016</td>\n",
-       "      <td>nicholas financial incorporated</td>\n",
-       "      <td>florida</td>\n",
-       "      <td>NXLS FNNXL INKRPRTT</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>24</th>\n",
-       "      <td>24</td>\n",
-       "      <td>2017</td>\n",
-       "      <td>nicholas financial incorporated</td>\n",
-       "      <td>florida</td>\n",
-       "      <td>NXLS FNNXL INKRPRTT</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>68</th>\n",
-       "      <td>68</td>\n",
-       "      <td>2016</td>\n",
-       "      <td>sandisk corporation</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>SNTSK KRPRXN</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "    record_id  report_year                     company_name loc_of_incorporation  company_name_mphone\n",
-       "23         23         2016  nicholas financial incorporated              florida  NXLS FNNXL INKRPRTT\n",
-       "24         24         2017  nicholas financial incorporated              florida  NXLS FNNXL INKRPRTT\n",
-       "68         68         2016              sandisk corporation             delaware         SNTSK KRPRXN"
-      ]
-     },
-     "execution_count": 196,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "sec_test_df.head(3)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 197,
-   "id": "e24e2c8f-1124-4e87-b77d-55fca14a7d3c",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>record_id</th>\n",
-       "      <th>report_year</th>\n",
-       "      <th>company_name</th>\n",
-       "      <th>loc_of_incorporation</th>\n",
-       "      <th>company_name_mphone</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>2832746</th>\n",
-       "      <td>0</td>\n",
-       "      <td>2016</td>\n",
-       "      <td>capstone turbine singapore pte., limited</td>\n",
-       "      <td>singapore</td>\n",
-       "      <td>KPSTN TRBN SNKPR PT LMTT</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2832747</th>\n",
-       "      <td>1</td>\n",
-       "      <td>2016</td>\n",
-       "      <td>capstone turbine international, incorporated</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>KPSTN TRBN INTRNXNL INKRPRTT</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2832748</th>\n",
-       "      <td>2</td>\n",
-       "      <td>2016</td>\n",
-       "      <td>capstone turbine financial services, limited l...</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>KPSTN TRBN FNNXL SRFSS LMTT LBLT KMPN</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "         record_id  report_year                                       company_name loc_of_incorporation                    company_name_mphone\n",
-       "2832746          0         2016           capstone turbine singapore pte., limited            singapore               KPSTN TRBN SNKPR PT LMTT\n",
-       "2832747          1         2016       capstone turbine international, incorporated             delaware           KPSTN TRBN INTRNXNL INKRPRTT\n",
-       "2832748          2         2016  capstone turbine financial services, limited l...             delaware  KPSTN TRBN FNNXL SRFSS LMTT LBLT KMPN"
-      ]
-     },
-     "execution_count": 197,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "ex21_test_df.head(3)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 198,
-   "id": "c531657f-5a0a-4ff5-b680-c6a1806feb75",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# can we just load this linker and make predictions? what happens with blocking?\n",
-    "sec_ex21_linker = Linker([sec_test_df, ex21_test_df], sec_ex21_settings, db_api=DuckDBAPI())"
+    "# duplicates exist because of differing report years\n",
+    "eia_match_df.duplicated(subset=match_cols).value_counts()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 199,
-   "id": "14b239db-a816-428c-a132-dca0ed0998c4",
+   "execution_count": 52,
+   "id": "b53e6244-f0ca-4256-bc09-9c3264675389",
    "metadata": {},
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Blocking time: 0.44 seconds\n"
-     ]
-    },
     {
      "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "661a74c00c7e41f59787cad30a26ec78",
-       "version_major": 2,
-       "version_minor": 0
-      },
       "text/plain": [
-       "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
+       "True     168445\n",
+       "False     64515\n",
+       "Name: count, dtype: int64"
       ]
      },
+     "execution_count": 52,
      "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Predict time: 115.79 seconds\n"
-     ]
+     "output_type": "execute_result"
     }
    ],
    "source": [
-    "sec_ex21_preds = sec_ex21_linker.inference.predict(threshold_match_probability=0.6)"
+    "sec_match_df.duplicated(subset=match_cols).value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "id": "baa742ae-1b49-4d0a-84c8-5f864398c8ed",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sec_match_df = sec_match_df.sort_values(by=\"report_year\", ascending=False).drop_duplicates(subset=match_cols, keep=\"first\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 200,
-   "id": "08167db9-9d9c-4b09-a839-847f85842324",
+   "execution_count": 57,
+   "id": "63e47f5f-e142-48fa-9ffa-e14d27ee1476",
    "metadata": {},
    "outputs": [],
    "source": [
-    "sec_ex21_preds_df = sec_ex21_preds.as_pandas_dataframe()"
+    "eia_match_df = eia_match_df.sort_values(by=\"report_year\", ascending=False).drop_duplicates(subset=match_cols, keep=\"first\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 201,
-   "id": "3f349a0a-269a-4f34-95e8-54a8c96c57f8",
+   "execution_count": 39,
+   "id": "5cf7ca17-b42b-40c6-b6f7-9077acdb1220",
    "metadata": {},
    "outputs": [
     {
      "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>match_weight</th>\n",
-       "      <th>match_probability</th>\n",
-       "      <th>source_dataset_l</th>\n",
-       "      <th>source_dataset_r</th>\n",
-       "      <th>record_id_l</th>\n",
-       "      <th>record_id_r</th>\n",
-       "      <th>company_name_l</th>\n",
-       "      <th>company_name_r</th>\n",
-       "      <th>gamma_company_name</th>\n",
-       "      <th>tf_company_name_l</th>\n",
-       "      <th>tf_company_name_r</th>\n",
-       "      <th>bf_company_name</th>\n",
-       "      <th>bf_tf_adj_company_name</th>\n",
-       "      <th>loc_of_incorporation_l</th>\n",
-       "      <th>loc_of_incorporation_r</th>\n",
-       "      <th>gamma_loc_of_incorporation</th>\n",
-       "      <th>tf_loc_of_incorporation_l</th>\n",
-       "      <th>tf_loc_of_incorporation_r</th>\n",
-       "      <th>bf_loc_of_incorporation</th>\n",
-       "      <th>bf_tf_adj_loc_of_incorporation</th>\n",
-       "      <th>company_name_mphone_l</th>\n",
-       "      <th>company_name_mphone_r</th>\n",
-       "      <th>report_year_l</th>\n",
-       "      <th>report_year_r</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>11.726954</td>\n",
-       "      <td>0.999705</td>\n",
-       "      <td>__splink__input_table_0</td>\n",
-       "      <td>__splink__input_table_1</td>\n",
-       "      <td>95551</td>\n",
-       "      <td>5939</td>\n",
-       "      <td>pendrell corporation</td>\n",
-       "      <td>pentzer corporation</td>\n",
-       "      <td>3</td>\n",
-       "      <td>0.000008</td>\n",
-       "      <td>0.000004</td>\n",
-       "      <td>35295.437753</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>washington</td>\n",
-       "      <td>washington</td>\n",
-       "      <td>3</td>\n",
-       "      <td>0.003427</td>\n",
-       "      <td>0.003427</td>\n",
-       "      <td>2.321780</td>\n",
-       "      <td>60.034545</td>\n",
-       "      <td>PNTRL KRPRXN</td>\n",
-       "      <td>PNTSR KRPRXN</td>\n",
-       "      <td>2017</td>\n",
-       "      <td>2017</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>0.981720</td>\n",
-       "      <td>0.663845</td>\n",
-       "      <td>__splink__input_table_0</td>\n",
-       "      <td>__splink__input_table_1</td>\n",
-       "      <td>80041</td>\n",
-       "      <td>1485</td>\n",
-       "      <td>spok holdings, incorporated</td>\n",
-       "      <td>autohaus holdings, incorporated</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000008</td>\n",
-       "      <td>0.000004</td>\n",
-       "      <td>2126.980572</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>3</td>\n",
-       "      <td>0.354513</td>\n",
-       "      <td>0.354513</td>\n",
-       "      <td>2.321780</td>\n",
-       "      <td>0.580388</td>\n",
-       "      <td>SPK HLTNKS INKRPRTT</td>\n",
-       "      <td>ATHS HLTNKS INKRPRTT</td>\n",
-       "      <td>2017</td>\n",
-       "      <td>2017</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>4.604002</td>\n",
-       "      <td>0.960504</td>\n",
-       "      <td>__splink__input_table_0</td>\n",
-       "      <td>__splink__input_table_1</td>\n",
-       "      <td>72068</td>\n",
-       "      <td>2731</td>\n",
-       "      <td>ashford hospitality trust incorporated</td>\n",
-       "      <td>ashford hospitality trust, incorporated</td>\n",
-       "      <td>3</td>\n",
-       "      <td>0.000008</td>\n",
-       "      <td>0.000004</td>\n",
-       "      <td>35295.437753</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>maryland</td>\n",
-       "      <td>None</td>\n",
-       "      <td>-1</td>\n",
-       "      <td>0.010087</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>AXFRT HSPTLT TRST INKRPRTT</td>\n",
-       "      <td>AXFRT HSPTLT TRST INKRPRTT</td>\n",
-       "      <td>2017</td>\n",
-       "      <td>2017</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>3.901062</td>\n",
-       "      <td>0.937263</td>\n",
-       "      <td>__splink__input_table_0</td>\n",
-       "      <td>__splink__input_table_1</td>\n",
-       "      <td>58652</td>\n",
-       "      <td>1115</td>\n",
-       "      <td>tx holdings, incorporated</td>\n",
-       "      <td>tex holdings, incorporated</td>\n",
-       "      <td>3</td>\n",
-       "      <td>0.000008</td>\n",
-       "      <td>0.000004</td>\n",
-       "      <td>35295.437753</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>georgia</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.005596</td>\n",
-       "      <td>0.354513</td>\n",
-       "      <td>0.614319</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>TKS HLTNKS INKRPRTT</td>\n",
-       "      <td>TKS HLTNKS INKRPRTT</td>\n",
-       "      <td>2017</td>\n",
-       "      <td>2017</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>4.604002</td>\n",
-       "      <td>0.960504</td>\n",
-       "      <td>__splink__input_table_0</td>\n",
-       "      <td>__splink__input_table_1</td>\n",
-       "      <td>82946</td>\n",
-       "      <td>1757</td>\n",
-       "      <td>pharma bio serv, incorporated</td>\n",
-       "      <td>pharma bio serv us, incorporated</td>\n",
-       "      <td>3</td>\n",
-       "      <td>0.000008</td>\n",
-       "      <td>0.000004</td>\n",
-       "      <td>35295.437753</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>None</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>-1</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>0.354513</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>FRM B SRF INKRPRTT</td>\n",
-       "      <td>FRM B SRF US INKRPRTT</td>\n",
-       "      <td>2017</td>\n",
-       "      <td>2017</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>9343</th>\n",
-       "      <td>0.981720</td>\n",
-       "      <td>0.663845</td>\n",
-       "      <td>__splink__input_table_0</td>\n",
-       "      <td>__splink__input_table_1</td>\n",
-       "      <td>248688</td>\n",
-       "      <td>1135</td>\n",
-       "      <td>transenterix incorporated</td>\n",
-       "      <td>trane brands, incorporated</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000008</td>\n",
-       "      <td>0.000004</td>\n",
-       "      <td>2126.980572</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>3</td>\n",
-       "      <td>0.354513</td>\n",
-       "      <td>0.354513</td>\n",
-       "      <td>2.321780</td>\n",
-       "      <td>0.580388</td>\n",
-       "      <td>TRNSNTRKS INKRPRTT</td>\n",
-       "      <td>TRN BRNTS INKRPRTT</td>\n",
-       "      <td>2017</td>\n",
-       "      <td>2017</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>9344</th>\n",
-       "      <td>3.901062</td>\n",
-       "      <td>0.937263</td>\n",
-       "      <td>__splink__input_table_0</td>\n",
-       "      <td>__splink__input_table_1</td>\n",
-       "      <td>260283</td>\n",
-       "      <td>3506</td>\n",
-       "      <td>cree incorporated</td>\n",
-       "      <td>j.crew incorporated</td>\n",
-       "      <td>3</td>\n",
-       "      <td>0.000008</td>\n",
-       "      <td>0.000004</td>\n",
-       "      <td>35295.437753</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>north carolina</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.004926</td>\n",
-       "      <td>0.354513</td>\n",
-       "      <td>0.614319</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>KR INKRPRTT</td>\n",
-       "      <td>JKR INKRPRTT</td>\n",
-       "      <td>2017</td>\n",
-       "      <td>2017</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>9345</th>\n",
-       "      <td>0.981720</td>\n",
-       "      <td>0.663845</td>\n",
-       "      <td>__splink__input_table_0</td>\n",
-       "      <td>__splink__input_table_1</td>\n",
-       "      <td>232258</td>\n",
-       "      <td>3973</td>\n",
-       "      <td>applied minerals, incorporated</td>\n",
-       "      <td>applied materials spv2, incorporated</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000008</td>\n",
-       "      <td>0.000008</td>\n",
-       "      <td>2126.980572</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>3</td>\n",
-       "      <td>0.354513</td>\n",
-       "      <td>0.354513</td>\n",
-       "      <td>2.321780</td>\n",
-       "      <td>0.580388</td>\n",
-       "      <td>APLT MNRLS INKRPRTT</td>\n",
-       "      <td>APLT MTRLS SPF INKRPRTT</td>\n",
-       "      <td>2017</td>\n",
-       "      <td>2016</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>9346</th>\n",
-       "      <td>3.901062</td>\n",
-       "      <td>0.937263</td>\n",
-       "      <td>__splink__input_table_0</td>\n",
-       "      <td>__splink__input_table_1</td>\n",
-       "      <td>232258</td>\n",
-       "      <td>3970</td>\n",
-       "      <td>applied minerals, incorporated</td>\n",
-       "      <td>applied materials japan, incorporated</td>\n",
-       "      <td>3</td>\n",
-       "      <td>0.000008</td>\n",
-       "      <td>0.000008</td>\n",
-       "      <td>35295.437753</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>japan</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.354513</td>\n",
-       "      <td>0.005795</td>\n",
-       "      <td>0.614319</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>APLT MNRLS INKRPRTT</td>\n",
-       "      <td>APLT MTRLS JPN INKRPRTT</td>\n",
-       "      <td>2017</td>\n",
-       "      <td>2016</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>9347</th>\n",
-       "      <td>2.724934</td>\n",
-       "      <td>0.868616</td>\n",
-       "      <td>__splink__input_table_0</td>\n",
-       "      <td>__splink__input_table_1</td>\n",
-       "      <td>267563</td>\n",
-       "      <td>285</td>\n",
-       "      <td>guess incorporated</td>\n",
-       "      <td>aquesys, incorporated</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000008</td>\n",
-       "      <td>0.000008</td>\n",
-       "      <td>2126.980572</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>us delaware</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.354513</td>\n",
-       "      <td>0.000462</td>\n",
-       "      <td>4.511276</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>KS INKRPRTT</td>\n",
-       "      <td>AKSS INKRPRTT</td>\n",
-       "      <td>2017</td>\n",
-       "      <td>2016</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>9348 rows × 24 columns</p>\n",
-       "</div>"
-      ],
       "text/plain": [
-       "      match_weight  match_probability         source_dataset_l         source_dataset_r  record_id_l  record_id_r                          company_name_l                           company_name_r  gamma_company_name  tf_company_name_l  tf_company_name_r  bf_company_name  bf_tf_adj_company_name loc_of_incorporation_l loc_of_incorporation_r  gamma_loc_of_incorporation  tf_loc_of_incorporation_l  tf_loc_of_incorporation_r  bf_loc_of_incorporation  bf_tf_adj_loc_of_incorporation       company_name_mphone_l       company_name_mphone_r  report_year_l  report_year_r\n",
-       "0        11.726954           0.999705  __splink__input_table_0  __splink__input_table_1        95551         5939                    pendrell corporation                      pentzer corporation                   3           0.000008           0.000004     35295.437753                     1.0             washington             washington                           3                   0.003427                   0.003427                 2.321780                       60.034545                PNTRL KRPRXN                PNTSR KRPRXN           2017           2017\n",
-       "1         0.981720           0.663845  __splink__input_table_0  __splink__input_table_1        80041         1485             spok holdings, incorporated          autohaus holdings, incorporated                   2           0.000008           0.000004      2126.980572                     1.0               delaware               delaware                           3                   0.354513                   0.354513                 2.321780                        0.580388         SPK HLTNKS INKRPRTT        ATHS HLTNKS INKRPRTT           2017           2017\n",
-       "2         4.604002           0.960504  __splink__input_table_0  __splink__input_table_1        72068         2731  ashford hospitality trust incorporated  ashford hospitality trust, incorporated                   3           0.000008           0.000004     35295.437753                     1.0               maryland                   None                          -1                   0.010087                        NaN                 1.000000                        1.000000  AXFRT HSPTLT TRST INKRPRTT  AXFRT HSPTLT TRST INKRPRTT           2017           2017\n",
-       "3         3.901062           0.937263  __splink__input_table_0  __splink__input_table_1        58652         1115               tx holdings, incorporated               tex holdings, incorporated                   3           0.000008           0.000004     35295.437753                     1.0                georgia               delaware                           0                   0.005596                   0.354513                 0.614319                        1.000000         TKS HLTNKS INKRPRTT         TKS HLTNKS INKRPRTT           2017           2017\n",
-       "4         4.604002           0.960504  __splink__input_table_0  __splink__input_table_1        82946         1757           pharma bio serv, incorporated         pharma bio serv us, incorporated                   3           0.000008           0.000004     35295.437753                     1.0                   None               delaware                          -1                        NaN                   0.354513                 1.000000                        1.000000          FRM B SRF INKRPRTT       FRM B SRF US INKRPRTT           2017           2017\n",
-       "...            ...                ...                      ...                      ...          ...          ...                                     ...                                      ...                 ...                ...                ...              ...                     ...                    ...                    ...                         ...                        ...                        ...                      ...                             ...                         ...                         ...            ...            ...\n",
-       "9343      0.981720           0.663845  __splink__input_table_0  __splink__input_table_1       248688         1135               transenterix incorporated               trane brands, incorporated                   2           0.000008           0.000004      2126.980572                     1.0               delaware               delaware                           3                   0.354513                   0.354513                 2.321780                        0.580388          TRNSNTRKS INKRPRTT          TRN BRNTS INKRPRTT           2017           2017\n",
-       "9344      3.901062           0.937263  __splink__input_table_0  __splink__input_table_1       260283         3506                       cree incorporated                      j.crew incorporated                   3           0.000008           0.000004     35295.437753                     1.0         north carolina               delaware                           0                   0.004926                   0.354513                 0.614319                        1.000000                 KR INKRPRTT                JKR INKRPRTT           2017           2017\n",
-       "9345      0.981720           0.663845  __splink__input_table_0  __splink__input_table_1       232258         3973          applied minerals, incorporated     applied materials spv2, incorporated                   2           0.000008           0.000008      2126.980572                     1.0               delaware               delaware                           3                   0.354513                   0.354513                 2.321780                        0.580388         APLT MNRLS INKRPRTT     APLT MTRLS SPF INKRPRTT           2017           2016\n",
-       "9346      3.901062           0.937263  __splink__input_table_0  __splink__input_table_1       232258         3970          applied minerals, incorporated    applied materials japan, incorporated                   3           0.000008           0.000008     35295.437753                     1.0               delaware                  japan                           0                   0.354513                   0.005795                 0.614319                        1.000000         APLT MNRLS INKRPRTT     APLT MTRLS JPN INKRPRTT           2017           2016\n",
-       "9347      2.724934           0.868616  __splink__input_table_0  __splink__input_table_1       267563          285                      guess incorporated                    aquesys, incorporated                   2           0.000008           0.000008      2126.980572                     1.0               delaware            us delaware                           2                   0.354513                   0.000462                 4.511276                        1.000000                 KS INKRPRTT               AKSS INKRPRTT           2017           2016\n",
-       "\n",
-       "[9348 rows x 24 columns]"
+       "standard_industrial_classification\n",
+       "asset-backed securities [6189]          20311\n",
+       "pharmaceutical preparations [2834]       8530\n",
+       "state commercial banks [6022]            7886\n",
+       "real estate investment trusts [6798]     7706\n",
+       "services-prepackaged software [7372]     6007\n",
+       "Name: count, dtype: int64"
       ]
      },
-     "execution_count": 201,
+     "execution_count": 39,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "# TODO: this needs to be improved, maybe just do a fuzzy match on string name?\n",
-    "sec_ex21_preds_df"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 34,
-   "id": "defdf953-4af7-4d43-b7cf-5ae95360d70f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# add the Ex. 21 subsidiaries that don't get a matching CIK to the SEC side\n",
-    "# run on all the data\n",
-    "# save the mapping of subsidiaries that are greater than a certain threshold (unclear why the blocking isn't working)\n",
-    "# get the subsidiaries that are less than a certain threshold\n",
-    "# transform them to have columns that match with the SEC df\n",
-    "# add them to the SEC side"
+    "# could try to use keywords like gas, electricity, utility etc.\n",
+    "sec_clean_df[\"standard_industrial_classification\"].value_counts().head(5)"
    ]
   },
   {
@@ -1947,20 +1415,11 @@
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 66,
-   "id": "6402e556-b87c-47ca-bc30-ced2b42e6626",
+   "cell_type": "markdown",
+   "id": "5d0b403f-8a1a-4ee2-89db-f274f6a55bbd",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "# probably shouldn't be blocking on report year, because we don't care that much \n",
-    "# about report year lining up\n",
-    "# try overlap between tokens in address or company name\n",
-    "br0 = \"substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4)\"\n",
-    "br1 = \"l.street_address = r.street_address\"\n",
-    "br2 = \"substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3) and l.city = r.city\"\n",
-    "# br3 = \"substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3) and l.zip_code = r.zip_code\"\n",
-    "br3 = \"substr(l.company_name_mphone,1,2) = substr(r.company_name_mphone,1,2) and array_length(list_intersect(l.street_address_list, r.street_address_list)) >= 2\""
+    "TODO: import BLOCKING RULES from config"
    ]
   },
   {
@@ -1987,7 +1446,7 @@
    "source": [
     "counts = count_comparisons_from_blocking_rule(\n",
     "    table_or_tables=[sec_match_df, eia_match_df],\n",
-    "    blocking_rule=br0,\n",
+    "    blocking_rule=BLOCKING_RULES[0],\n",
     "    link_type=\"link_only\",\n",
     "    unique_id_column_name='record_id',\n",
     "    db_api=db_api,\n",
@@ -2070,7 +1529,7 @@
    "source": [
     "result = n_largest_blocks(\n",
     "    table_or_tables=[sec_match_df, eia_match_df],\n",
-    "    blocking_rule=br0,\n",
+    "    blocking_rule=BLOCKING_RULES[0],\n",
     "    link_type=\"link_only\",\n",
     "    db_api=db_api,\n",
     "    n_largest=3\n",
@@ -2179,14 +1638,9 @@
     }
    ],
    "source": [
-    "blocking_rules_for_analysis = [\n",
-    "    br0, br1, br2, br3\n",
-    "]\n",
-    "\n",
-    "\n",
     "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n",
     "    table_or_tables=[sec_match_df, eia_match_df],\n",
-    "    blocking_rules=blocking_rules_for_analysis,\n",
+    "    blocking_rules=BLOCKING_RULES,\n",
     "    db_api=db_api,\n",
     "    unique_id_column_name='record_id',\n",
     "    link_type=\"link_only\",\n",
@@ -2201,6 +1655,14 @@
     "## Create Model"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "d35162e9-f671-4e99-a261-e1bd4d16717e",
+   "metadata": {},
+   "source": [
+    "TODO: import comparisons from config"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 334,
@@ -2373,7 +1835,7 @@
     "        city_comparison\n",
     "    ],\n",
     "    blocking_rules_to_generate_predictions=[\n",
-    "        br0, br1, br2, br3\n",
+    "        BLOCKING_RULES\n",
     "    ],\n",
     "    retain_intermediate_calculation_columns=True,\n",
     ")\n",
@@ -2381,6 +1843,14 @@
     "linker = Linker([sec_match_df, eia_match_df], settings, db_api=DuckDBAPI())"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "04fda31f-fcea-446e-813a-08617d7a43bf",
+   "metadata": {},
+   "source": [
+    "TODO: import deterministic rules"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 453,
@@ -5292,10 +4762,29 @@
     "preds_df[preds_df.match_probability >= .95].sort_values(by=\"match_probability\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "ad4d3859-81d1-4fa8-98cc-ff7c9fd038f6",
+   "metadata": {},
+   "source": [
+    "# Match to Ex. 21 subsidiaries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "id": "d1c56b09-80c7-4bfe-b1ec-c0220cadafbf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# match EIA records that don't have a prediction to EIA subsidiaries\n",
+    "# can reuse code from SEC module?"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "288ffe20-c69e-4c96-8835-765c06303bf2",
+   "id": "a5599b7a-ea9a-40fd-9ce1-cb79a8d4dc35",
    "metadata": {},
    "outputs": [],
    "source": []

From 7dc78e19b9cda4eda7625bd2433bd06cd39aa05e Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 4 Dec 2024 15:35:58 -0500
Subject: [PATCH 147/161] Fix dagster setup for record linkage inputs

---
 .pre-commit-config.yaml                       |  1 -
 environment.yml                               |  1 -
 .../models/sec_eia_record_linkage/__init__.py | 19 +++++++++++++++++--
 .../transform_sec_input.py                    | 14 +++++---------
 workspace.yaml                                |  1 +
 5 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 2aaf16a..7516290 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -35,7 +35,6 @@ repos:
     rev: 24.10.0
     hooks:
       - id: black
-        language_version: python3.11
 
   - repo: https://github.com/pre-commit/mirrors-prettier
     rev: v4.0.0-alpha.8
diff --git a/environment.yml b/environment.yml
index a902ea3..33b1e04 100644
--- a/environment.yml
+++ b/environment.yml
@@ -30,5 +30,4 @@ dependencies:
   # Use pip to install the package defined by this repo for development:
   - pip:
       # - git+https://github.com/catalyst-cooperative/pudl.git@main
-      - -e /Users/katielamb/CatalystCoop/pudl
       - --editable ./[dev,docs,tests,types]
diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py
index c87c0cb..932b5f8 100644
--- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py
+++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py
@@ -1,6 +1,6 @@
 """Implement record linkage model between SEC companies and EIA utilities."""
 
-from dagster import Definitions, load_assets_from_modules
+from dagster import AssetKey, AssetSpec, Definitions, load_assets_from_modules
 from dagstermill import (
     ConfigurableLocalOutputNotebookIOManager,
 )
@@ -18,6 +18,7 @@
 )
 from mozilla_sec_eia.models.sec10k.utils.cloud import cloud_interface_resource
 
+from ..sec10k.extract import year_quarter_partitions
 from . import transform_eia_input, transform_sec_input
 
 eia_assets = load_assets_from_modules([transform_eia_input])
@@ -30,8 +31,22 @@
     "sec_input_table_creation", transform_sec_input.production_assets
 )
 
+basic_10k_company_info = AssetSpec(
+    key=AssetKey("basic_10k_company_info")
+).with_io_manager_key("pandas_parquet_io_manager")
+
+ex21_company_ownership_info = AssetSpec(
+    key=AssetKey("ex21_company_ownership_info"), partitions_def=year_quarter_partitions
+).with_io_manager_key("pandas_parquet_io_manager")
+
+sec10k_filing_metadata = AssetSpec(
+    key=AssetKey("sec10k_filing_metadata"), partitions_def=year_quarter_partitions
+).with_io_manager_key("io_manager")
+
 defs = Definitions(
-    sec_assets,
+    sec_assets
+    + eia_assets
+    + [basic_10k_company_info, ex21_company_ownership_info, sec10k_filing_metadata],
     jobs=[eia_input_table_production_job, sec_input_table_production_job],
     resources={
         "cloud_interface": cloud_interface_resource,
diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py
index 7c4aab0..82c891e 100644
--- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py
+++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py
@@ -7,7 +7,7 @@
 
 import numpy as np
 import pandas as pd
-from dagster import AssetIn, AssetOut, multi_asset
+from dagster import AllPartitionMapping, AssetIn, AssetOut, multi_asset
 
 from mozilla_sec_eia.library.record_linkage_utils import (
     fill_street_address_nulls,
@@ -18,11 +18,6 @@
 )
 from mozilla_sec_eia.models.sec_eia_record_linkage.sec_eia_splink_config import STR_COLS
 
-from ..sec10k.extract import (
-    sec10k_filing_metadata,
-    year_quarter_partitions,
-)
-
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
 
@@ -264,13 +259,14 @@ def create_sec_company_id_for_ex21_subs(ex21_df: pd.DataFrame) -> pd.DataFrame:
 @multi_asset(
     ins={
         "ex21_df": AssetIn("ex21_company_ownership_info"),
+        "sec10k_filing_metadata": AssetIn("sec10k_filing_metadata"),
     },
     outs={
         "transformed_ex21_subsidiary_table": AssetOut(
             io_manager_key="pandas_parquet_io_manager",
         )
     },
-    partitions_def=year_quarter_partitions,
+    partitions_def=AllPartitionMapping(),
 )
 def transform_ex21_table(
     ex21_df: pd.DataFrame, sec10k_filing_metadata: pd.DataFrame
@@ -323,6 +319,7 @@ def transform_basic10k_table(
     ins={
         "basic_10k_df": AssetIn("basic_10k_company_info"),
         "clean_ex21_df": AssetIn("transformed_ex21_subsidiary_table"),
+        "sec10k_filing_metadata": AssetIn("sec10k_filing_metadata"),
         # specify an io_manager_key?
     },
     outs={
@@ -331,7 +328,6 @@ def transform_basic10k_table(
             # specify a dagster_type?
         ),
     },
-    partitions_def=year_quarter_partitions,
 )
 def sec_rl_input_table(
     basic_10k_df: pd.DataFrame,
@@ -368,4 +364,4 @@ def sec_rl_input_table(
     return out_df
 
 
-production_assets = [sec_rl_input_table, sec10k_filing_metadata]
+production_assets = [sec_rl_input_table, transform_ex21_table]
diff --git a/workspace.yaml b/workspace.yaml
index 144aada..a208373 100644
--- a/workspace.yaml
+++ b/workspace.yaml
@@ -1,2 +1,3 @@
 load_from:
   - python_module: mozilla_sec_eia.models.sec10k
+  - python_module: mozilla_sec_eia.models.sec_eia_record_linkage

From daa8f0aafa624c9bf6ad55443738c265f3f11ba3 Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Mon, 9 Dec 2024 14:08:34 -0800
Subject: [PATCH 148/161] fix util functions

---
 src/mozilla_sec_eia/library/record_linkage_utils.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/mozilla_sec_eia/library/record_linkage_utils.py b/src/mozilla_sec_eia/library/record_linkage_utils.py
index 9a33392..f899bf3 100644
--- a/src/mozilla_sec_eia/library/record_linkage_utils.py
+++ b/src/mozilla_sec_eia/library/record_linkage_utils.py
@@ -88,11 +88,13 @@ def get_metaphone_col(col: pd.Series) -> pd.Series:
     return col.apply(jellyfish.metaphone)
 
 
-def transform_company_name(df: pd.DataFrame) -> pd.DataFrame:
+def transform_company_name(
+    df: pd.DataFrame, col_name: str = "company_name"
+) -> pd.DataFrame:
     """Apply cleaning, get metaphone col, drop invalid rows."""
-    df = clean_company_name(df)
-    df.loc[:, "company_name_mphone"] = get_metaphone_col(df["company_name_no_legal"])
-    df = drop_invalid_names(df, "company_name_clean")
+    df = clean_company_name(df, col_name=col_name)
+    df.loc[:, f"{col_name}_mphone"] = get_metaphone_col(df[f"{col_name}_no_legal"])
+    df = drop_invalid_names(df, col_name)
     return df
 
 
@@ -102,7 +104,7 @@ def fill_street_address_nulls(
     secondary_address_col: str = "street_address_2",
 ) -> pd.DataFrame:
     """Fill null street address with value from secondary address column."""
-    df[address_col] = pd.where(
+    df[address_col] = df[address_col].where(
         (~df[address_col].isnull()) | (df[secondary_address_col].isnull()),
         df[secondary_address_col],
     )

From dbefe3426af2c867459bfb02c9ef393570a90616 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Tue, 10 Dec 2024 13:46:41 -0500
Subject: [PATCH 149/161] Handle missing partitions in extracted data

---
 .../models/sec_eia_record_linkage/__init__.py | 22 +++++++++--
 .../transform_sec_input.py                    | 37 ++++++++-----------
 2 files changed, 35 insertions(+), 24 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py
index 932b5f8..bbcfa2f 100644
--- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py
+++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py
@@ -1,6 +1,12 @@
 """Implement record linkage model between SEC companies and EIA utilities."""
 
-from dagster import AssetKey, AssetSpec, Definitions, load_assets_from_modules
+from dagster import (
+    AssetKey,
+    AssetSpec,
+    Definitions,
+    StaticPartitionsDefinition,
+    load_assets_from_modules,
+)
 from dagstermill import (
     ConfigurableLocalOutputNotebookIOManager,
 )
@@ -35,12 +41,22 @@
     key=AssetKey("basic_10k_company_info")
 ).with_io_manager_key("pandas_parquet_io_manager")
 
+# Create year_quarter partitions
+completed_partitions = StaticPartitionsDefinition(
+    [
+        year_quarter
+        for year_quarter in year_quarter_partitions.get_partition_keys()
+        if year_quarter
+        not in ["2018q1", "2018q2", "2019q1", "2020q1", "2021q1", "2022q1"]
+    ]
+)
+
 ex21_company_ownership_info = AssetSpec(
-    key=AssetKey("ex21_company_ownership_info"), partitions_def=year_quarter_partitions
+    key=AssetKey("ex21_company_ownership_info"), partitions_def=completed_partitions
 ).with_io_manager_key("pandas_parquet_io_manager")
 
 sec10k_filing_metadata = AssetSpec(
-    key=AssetKey("sec10k_filing_metadata"), partitions_def=year_quarter_partitions
+    key=AssetKey("sec10k_filing_metadata"), partitions_def=completed_partitions
 ).with_io_manager_key("io_manager")
 
 defs = Definitions(
diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py
index 82c891e..2e8fa96 100644
--- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py
+++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py
@@ -7,7 +7,7 @@
 
 import numpy as np
 import pandas as pd
-from dagster import AllPartitionMapping, AssetIn, AssetOut, multi_asset
+from dagster import AssetIn, asset
 
 from mozilla_sec_eia.library.record_linkage_utils import (
     fill_street_address_nulls,
@@ -256,22 +256,20 @@ def create_sec_company_id_for_ex21_subs(ex21_df: pd.DataFrame) -> pd.DataFrame:
     return ex21_df
 
 
-@multi_asset(
+@asset(
     ins={
-        "ex21_df": AssetIn("ex21_company_ownership_info"),
-        "sec10k_filing_metadata": AssetIn("sec10k_filing_metadata"),
-    },
-    outs={
-        "transformed_ex21_subsidiary_table": AssetOut(
-            io_manager_key="pandas_parquet_io_manager",
-        )
+        "ex21_dfs": AssetIn("ex21_company_ownership_info"),
+        "sec10k_filing_metadata_dfs": AssetIn("sec10k_filing_metadata"),
     },
-    partitions_def=AllPartitionMapping(),
 )
-def transform_ex21_table(
-    ex21_df: pd.DataFrame, sec10k_filing_metadata: pd.DataFrame
+def transformed_ex21_subsidiary_table(
+    ex21_dfs: dict[str, pd.DataFrame],
+    sec10k_filing_metadata_dfs: dict[str, pd.DataFrame],
 ) -> pd.DataFrame:
     """Transform Ex. 21 table of subsidiaries before combining with basic 10k table."""
+    ex21_df = pd.concat(ex21_dfs.values())
+    sec10k_filing_metadata = pd.concat(sec10k_filing_metadata_dfs.values())
+
     ex21_df.loc[:, "filename"] = convert_ex21_id_to_filename(ex21_df)
     ex21_df = ex21_df.drop(columns=["id"])
     ex21_df = _add_report_year_to_sec(ex21_df, sec10k_filing_metadata)
@@ -315,21 +313,15 @@ def transform_basic10k_table(
     return basic_10k_df
 
 
-@multi_asset(
+@asset(
     ins={
         "basic_10k_df": AssetIn("basic_10k_company_info"),
         "clean_ex21_df": AssetIn("transformed_ex21_subsidiary_table"),
         "sec10k_filing_metadata": AssetIn("sec10k_filing_metadata"),
         # specify an io_manager_key?
     },
-    outs={
-        "core_sec_10k__parents_and_subsidiaries": AssetOut(
-            io_manager_key="pandas_parquet_io_manager",
-            # specify a dagster_type?
-        ),
-    },
 )
-def sec_rl_input_table(
+def core_sec_10k__parents_and_subsidiaries(
     basic_10k_df: pd.DataFrame,
     clean_ex21_df: pd.DataFrame,
     sec10k_filing_metadata: pd.DataFrame,
@@ -364,4 +356,7 @@ def sec_rl_input_table(
     return out_df
 
 
-production_assets = [sec_rl_input_table, transform_ex21_table]
+production_assets = [
+    core_sec_10k__parents_and_subsidiaries,
+    transformed_ex21_subsidiary_table,
+]

From 97f5d68af4faff270192fcc067d59ee6bdec4eb1 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Wed, 11 Dec 2024 15:57:52 -0500
Subject: [PATCH 150/161] Fix basic_10k partitions

---
 .../models/sec_eia_record_linkage/__init__.py             | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py
index bbcfa2f..3350449 100644
--- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py
+++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py
@@ -37,10 +37,6 @@
     "sec_input_table_creation", transform_sec_input.production_assets
 )
 
-basic_10k_company_info = AssetSpec(
-    key=AssetKey("basic_10k_company_info")
-).with_io_manager_key("pandas_parquet_io_manager")
-
 # Create year_quarter partitions
 completed_partitions = StaticPartitionsDefinition(
     [
@@ -51,6 +47,10 @@
     ]
 )
 
+basic_10k_company_info = AssetSpec(
+    key=AssetKey("basic_10k_company_info"), partitions_def=completed_partitions
+).with_io_manager_key("pandas_parquet_io_manager")
+
 ex21_company_ownership_info = AssetSpec(
     key=AssetKey("ex21_company_ownership_info"), partitions_def=completed_partitions
 ).with_io_manager_key("pandas_parquet_io_manager")

From b26f1f8c907d41c0a643a3cfc41f876cb1d23bd9 Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Wed, 11 Dec 2024 15:15:07 -0800
Subject: [PATCH 151/161] debug materialization of rl input assets

---
 .../library/record_linkage_utils.py           | 36 +++++++++++++++----
 .../transform_sec_input.py                    | 12 ++++---
 2 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/src/mozilla_sec_eia/library/record_linkage_utils.py b/src/mozilla_sec_eia/library/record_linkage_utils.py
index f899bf3..924b6a4 100644
--- a/src/mozilla_sec_eia/library/record_linkage_utils.py
+++ b/src/mozilla_sec_eia/library/record_linkage_utils.py
@@ -1,5 +1,7 @@
 """Utility functions for cleaning strings during modeling preprocessing steps."""
 
+from enum import StrEnum
+
 import jellyfish
 import pandas as pd
 
@@ -75,11 +77,18 @@ def clean_company_name(
     return df
 
 
-def drop_invalid_names(
-    df: pd.DataFrame, col_name: str = "company_name"
+def handle_invalid_names(
+    df: pd.DataFrame, col_name: str = "company_name", drop_invalid: bool = True
 ) -> pd.DataFrame:
-    """Drop rows that have invalid company names, like just 'llc', or 'partnership'."""
-    return df[(~df[col_name].isin(INVALID_NAMES))]
+    """Handle rows that have invalid company names, like just 'llc', or 'partnership'.
+
+    Either drop invalid company name values or fill with the empty string. Invalid
+    values are contained in `INVALID_NAMES`.
+    """
+    if drop_invalid:
+        return df[(~df[col_name].isin(INVALID_NAMES))]
+    df[col_name] = df[col_name].where(~df[col_name].isin(INVALID_NAMES), "")
+    return df
 
 
 # TODO: this is in PUDL, deduplicate
@@ -88,13 +97,28 @@ def get_metaphone_col(col: pd.Series) -> pd.Series:
     return col.apply(jellyfish.metaphone)
 
 
+class HandleNulls(StrEnum):
+    """Enum for handling null values in company name transform."""
+
+    DROP = "drop"
+    FILL_EMPTY_STR = "fill_empty_str"
+
+
 def transform_company_name(
-    df: pd.DataFrame, col_name: str = "company_name"
+    df: pd.DataFrame,
+    col_name: str = "company_name",
+    handle_nulls: HandleNulls = HandleNulls.DROP,
 ) -> pd.DataFrame:
     """Apply cleaning, get metaphone col, drop invalid rows."""
     df = clean_company_name(df, col_name=col_name)
+    if handle_nulls == HandleNulls.DROP:
+        df = handle_invalid_names(df, col_name, drop_invalid=True)
+        df = df[~df[col_name].isnull()]
+    elif handle_nulls == HandleNulls.FILL_EMPTY_STR:
+        df = handle_invalid_names(df, col_name, drop_invalid=False)
+        df = df.fillna({col_name: ""})
     df.loc[:, f"{col_name}_mphone"] = get_metaphone_col(df[f"{col_name}_no_legal"])
-    df = drop_invalid_names(df, col_name)
+
     return df
 
 
diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py
index 2e8fa96..ff88151 100644
--- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py
+++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py
@@ -21,7 +21,7 @@
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 
 
-EX21_COL_MAP = {"subsidiary": "company_name", "loc": "loc_of_incorporation"}
+EX21_COL_MAP = {"subsidiary": "company_name", "loc": "location_of_inc"}
 SEC_COL_MAP = {
     "company_conformed_name": "company_name",
     "street_1": "street_address",
@@ -315,16 +315,16 @@ def transform_basic10k_table(
 
 @asset(
     ins={
-        "basic_10k_df": AssetIn("basic_10k_company_info"),
+        "basic_10k_dfs": AssetIn("basic_10k_company_info"),
         "clean_ex21_df": AssetIn("transformed_ex21_subsidiary_table"),
-        "sec10k_filing_metadata": AssetIn("sec10k_filing_metadata"),
+        "sec10k_filing_metadata_dfs": AssetIn("sec10k_filing_metadata"),
         # specify an io_manager_key?
     },
 )
 def core_sec_10k__parents_and_subsidiaries(
-    basic_10k_df: pd.DataFrame,
+    basic_10k_dfs: dict[str, pd.DataFrame],
     clean_ex21_df: pd.DataFrame,
-    sec10k_filing_metadata: pd.DataFrame,
+    sec10k_filing_metadata_dfs: dict[str, pd.DataFrame],
 ) -> pd.DataFrame:
     """Asset for creating an SEC 10K output table.
 
@@ -333,6 +333,8 @@ def core_sec_10k__parents_and_subsidiaries(
     filing companies. Create an sec_company_id for subsidiaries that aren't linked
     to a CIK.
     """
+    basic_10k_df = pd.concat(basic_10k_dfs.values())
+    sec10k_filing_metadata = pd.concat(sec10k_filing_metadata_dfs.values())
     basic_10k_df = transform_basic10k_table(basic_10k_df, sec10k_filing_metadata)
     ex21_df_with_cik = match_ex21_subsidiaries_to_filer_company(
         basic10k_df=basic_10k_df, ex21_df=clean_ex21_df

From acaf3d1866a6db34731c9c855a4baa34476af955 Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Mon, 16 Dec 2024 11:59:26 -0800
Subject: [PATCH 152/161] clean up notebook to work with dagster assets

---
 notebooks/18-kl-splink-sec-eia.ipynb          | 4000 +++++++----------
 .../library/record_linkage_utils.py           |   35 +
 .../sec_eia_record_linkage/preprocessing.py   |  167 -
 .../sec_eia_splink_config.py                  |    5 +-
 .../transform_eia_input.py                    |   34 +-
 .../transform_sec_input.py                    |  105 +-
 .../street_suffix_abbreviations.json          |  203 +
 7 files changed, 1875 insertions(+), 2674 deletions(-)
 delete mode 100644 src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py
 create mode 100644 src/mozilla_sec_eia/package_data/street_suffix_abbreviations.json

diff --git a/notebooks/18-kl-splink-sec-eia.ipynb b/notebooks/18-kl-splink-sec-eia.ipynb
index 2fdeb79..8de5812 100644
--- a/notebooks/18-kl-splink-sec-eia.ipynb
+++ b/notebooks/18-kl-splink-sec-eia.ipynb
@@ -31,8 +31,16 @@
     "from splink.exploratory import completeness_chart, profile_columns\n",
     "from upath import UPath\n",
     "\n",
-    "from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive, convert_ex21_id_to_filename\n",
-    "from mozilla_sec_eia.models.sec_eia_record_linkage.preprocessing import add_sec_company_id_to_subsidiaries, prepare_sec10k_basic_info_df, prepare_eia_df, prepare_ex21_df"
+    "from mozilla_sec_eia.models.sec_eia_record_linkage.sec_eia_splink_config import (\n",
+    "    BLOCKING_RULES,\n",
+    "    MATCH_COLS,\n",
+    "    SHARED_COLS,\n",
+    "    address_comparison,\n",
+    "    city_comparison,\n",
+    "    company_name_comparison,\n",
+    "    deterministic_blocking_rules,\n",
+    "    state_comparison\n",
+    ")"
    ]
   },
   {
@@ -51,37 +59,19 @@
     "### EIA"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "13d543e7-334c-4606-849b-c8d60ad668d2",
-   "metadata": {},
-   "source": [
-    "TODO: materialize asset and read in from Dagster GCS storage"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "id": "7f3e5fdd-2c16-4dc0-8ad1-cf4516fbee33",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from mozilla_sec_eia.models.sec_eia_record_linkage.create_eia_input import get_eia_utilities_table"
-   ]
-  },
   {
    "cell_type": "code",
-   "execution_count": 14,
-   "id": "70ebf6dc-ed00-4f78-bbaf-2805860a1b63",
+   "execution_count": 3,
+   "id": "8b1add80-34d7-44a8-a7b4-181a770bb2cb",
    "metadata": {},
    "outputs": [],
    "source": [
-    "eia_df = get_eia_utilities_table()"
+    "eia_df = pd.read_parquet(\"gs://sec10k-outputs/v2/core_eia__parents_and_subsidiaries.parquet\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 4,
    "id": "9547a0ca-39f7-46c3-9a02-dcb08b75181a",
    "metadata": {},
    "outputs": [
@@ -106,11 +96,13 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
+       "      <th>record_id</th>\n",
+       "      <th>company_name</th>\n",
+       "      <th>street_address</th>\n",
        "      <th>utility_id_eia</th>\n",
        "      <th>utility_id_pudl</th>\n",
-       "      <th>utility_name_eia</th>\n",
+       "      <th>company_name_raw</th>\n",
        "      <th>report_date</th>\n",
-       "      <th>street_address</th>\n",
        "      <th>city</th>\n",
        "      <th>state</th>\n",
        "      <th>zip_code</th>\n",
@@ -120,7 +112,7 @@
        "      <th>plants_reported_other_relationship</th>\n",
        "      <th>entity_type</th>\n",
        "      <th>attention_line</th>\n",
-       "      <th>address_2</th>\n",
+       "      <th>street_address_2</th>\n",
        "      <th>zip_code_4</th>\n",
        "      <th>contact_firstname</th>\n",
        "      <th>contact_lastname</th>\n",
@@ -133,15 +125,70 @@
        "      <th>phone_number_2</th>\n",
        "      <th>phone_extension_2</th>\n",
        "      <th>data_maturity</th>\n",
+       "      <th>report_year</th>\n",
+       "      <th>company_name_no_legal</th>\n",
+       "      <th>company_name_mphone</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>66550</td>\n",
-       "      <td>16573.0</td>\n",
-       "      <td>Telyon AMZ Windsor LLC</td>\n",
-       "      <td>2024-01-01</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0ham wham8 solar limited liability company</td>\n",
+       "      <td>100 california st suite 400</td>\n",
+       "      <td>64380</td>\n",
+       "      <td>8321.0</td>\n",
+       "      <td>0ham wham8 solar, llc</td>\n",
+       "      <td>2023-01-01</td>\n",
+       "      <td>san francisco</td>\n",
+       "      <td>ca</td>\n",
+       "      <td>94118</td>\n",
+       "      <td>True</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Q</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>final</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>0ham wham8 solar</td>\n",
+       "      <td>HM HM SLR</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>10 briggs solar ng limited liability company</td>\n",
+       "      <td>267 water st 2nd floor</td>\n",
+       "      <td>62685</td>\n",
+       "      <td>8502.0</td>\n",
+       "      <td>10 briggs solar ng, llc</td>\n",
+       "      <td>2020-01-01</td>\n",
+       "      <td>warren</td>\n",
+       "      <td>ri</td>\n",
+       "      <td>02885</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Q</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
        "      <td>None</td>\n",
        "      <td>None</td>\n",
        "      <td>None</td>\n",
@@ -149,9 +196,28 @@
        "      <td>None</td>\n",
        "      <td>None</td>\n",
        "      <td>None</td>\n",
+       "      <td>final</td>\n",
+       "      <td>2020</td>\n",
+       "      <td>10 briggs solar ng</td>\n",
+       "      <td>BRKS SLR NK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2</td>\n",
+       "      <td>1001 ebenezer church solar limited liability c...</td>\n",
+       "      <td>176 ebenezer church rd</td>\n",
+       "      <td>63186</td>\n",
+       "      <td>8567.0</td>\n",
+       "      <td>1001 ebenezer church solar, llc</td>\n",
+       "      <td>2020-01-01</td>\n",
+       "      <td>state road</td>\n",
+       "      <td>nc</td>\n",
+       "      <td>28676</td>\n",
+       "      <td>True</td>\n",
        "      <td>None</td>\n",
        "      <td>None</td>\n",
        "      <td>None</td>\n",
+       "      <td>Q</td>\n",
        "      <td>None</td>\n",
        "      <td>None</td>\n",
        "      <td>None</td>\n",
@@ -164,82 +230,75 @@
        "      <td>None</td>\n",
        "      <td>None</td>\n",
        "      <td>None</td>\n",
-       "      <td>monthly_update</td>\n",
+       "      <td>None</td>\n",
+       "      <td>final</td>\n",
+       "      <td>2020</td>\n",
+       "      <td>1001 ebenezer church solar</td>\n",
+       "      <td>EBNSR XRX SLR</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "   utility_id_eia  utility_id_pudl        utility_name_eia report_date street_address  city state zip_code plants_reported_owner plants_reported_operator plants_reported_asset_manager plants_reported_other_relationship entity_type attention_line address_2 zip_code_4 contact_firstname contact_lastname contact_title phone_number phone_extension contact_firstname_2 contact_lastname_2 contact_title_2 phone_number_2 phone_extension_2   data_maturity\n",
-       "0           66550          16573.0  Telyon AMZ Windsor LLC  2024-01-01           None  None  None     None                  None                     None                          None                               None        None           None      None       None              None             None          None         None            None                None               None            None           None              None  monthly_update"
+       "   record_id                                       company_name               street_address  utility_id_eia  utility_id_pudl                 company_name_raw report_date           city state zip_code plants_reported_owner plants_reported_operator plants_reported_asset_manager plants_reported_other_relationship entity_type attention_line street_address_2 zip_code_4 contact_firstname contact_lastname contact_title phone_number phone_extension contact_firstname_2 contact_lastname_2 contact_title_2 phone_number_2 phone_extension_2 data_maturity  report_year       company_name_no_legal company_name_mphone\n",
+       "0          0         0ham wham8 solar limited liability company  100 california st suite 400           64380           8321.0            0ham wham8 solar, llc  2023-01-01  san francisco    ca    94118                  True                     None                          None                               None           Q           None             None       None              None             None          None         None            None                None               None            None           None              None         final         2023            0ham wham8 solar           HM HM SLR\n",
+       "1          1       10 briggs solar ng limited liability company       267 water st 2nd floor           62685           8502.0          10 briggs solar ng, llc  2020-01-01         warren    ri    02885                  True                     True                          None                               None           Q           None             None       None              None             None          None         None            None                None               None            None           None              None         final         2020          10 briggs solar ng         BRKS SLR NK\n",
+       "2          2  1001 ebenezer church solar limited liability c...       176 ebenezer church rd           63186           8567.0  1001 ebenezer church solar, llc  2020-01-01     state road    nc    28676                  True                     None                          None                               None           Q           None             None       None              None             None          None         None            None                None               None            None           None              None         final         2020  1001 ebenezer church solar       EBNSR XRX SLR"
       ]
      },
-     "execution_count": 28,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "eia_df.head(1)"
+    "eia_df.head(3)"
    ]
   },
   {
-   "cell_type": "markdown",
-   "id": "ae7706d6-3f32-4c99-aeb6-dac6c6529eec",
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "755ab2a3-a32b-4ac1-81a5-0fb3a85dcdb3",
    "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "20821"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "### SEC 10K Basic Info"
+    "len(eia_df)"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "012db270-d944-464c-9d30-c5995ab491a4",
-   "metadata": {},
-   "source": [
-    "TODO: read in asset from Dagster GCS storage"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "id": "d4e950a6-ee6c-414c-b5b9-52a4175bf0b7",
+   "id": "ae7706d6-3f32-4c99-aeb6-dac6c6529eec",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "sec_path = UPath(\"gs://sec10k-outputs/v2/basic_10k_company_info\")"
+    "### SEC 10K Basic Info"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
-   "id": "14eb7f24-7f7b-43aa-a0df-85e888e43821",
+   "execution_count": 100,
+   "id": "3f5f9e6c-0725-48e1-920f-3d516b4388a6",
    "metadata": {},
    "outputs": [],
    "source": [
-    "raw_sec_df = pd.DataFrame()\n",
-    "for file in sec_path.iterdir():\n",
-    "    if file.name.split(\".\")[-1] == \"parquet\":\n",
-    "        raw_sec_df = pd.concat([raw_sec_df, pd.read_parquet(sec_path / file.name)])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 26,
-   "id": "1be3364e-9887-42b2-b303-0a24e8681acf",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "raw_sec_df = raw_sec_df.reset_index().pivot_table(values=\"value\", index=\"filename\", columns=\"key\", aggfunc=\"first\")\n",
-    "raw_sec_df.columns.name = None"
+    "sec_df = pd.read_pickle(\"/Users/katielamb/CatalystCoop/dagster_home/storage/core_sec_10k__parents_and_subsidiaries\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
-   "id": "5fcb05e5-6a57-439f-802f-527242f8f223",
+   "execution_count": 101,
+   "id": "a5ea9e1d-3afd-466f-a506-ecb3f23605c9",
    "metadata": {},
    "outputs": [
     {
@@ -263,13 +322,14 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
-       "      <th>]fiscal_year_end</th>\n",
-       "      <th>]irs_number</th>\n",
-       "      <th>]state_of_incorporation</th>\n",
-       "      <th>business_phone</th>\n",
+       "      <th>record_id</th>\n",
+       "      <th>company_name</th>\n",
+       "      <th>street_address</th>\n",
+       "      <th>filename</th>\n",
+       "      <th>phone_number</th>\n",
        "      <th>central_index_key</th>\n",
        "      <th>city</th>\n",
-       "      <th>company_conformed_name</th>\n",
+       "      <th>company_name_raw</th>\n",
        "      <th>date_of_name_change</th>\n",
        "      <th>film_number</th>\n",
        "      <th>fiscal_year_end</th>\n",
@@ -282,777 +342,344 @@
        "      <th>standard_industrial_classification</th>\n",
        "      <th>state</th>\n",
        "      <th>state_of_incorporation</th>\n",
-       "      <th>street_1</th>\n",
-       "      <th>street_2</th>\n",
-       "      <th>zip</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>filename</th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
+       "      <th>street_address_2</th>\n",
+       "      <th>zip_code</th>\n",
+       "      <th>report_date</th>\n",
+       "      <th>report_year</th>\n",
+       "      <th>location_of_inc</th>\n",
+       "      <th>company_name_no_legal</th>\n",
+       "      <th>company_name_mphone</th>\n",
+       "      <th>files_10k</th>\n",
+       "      <th>sec_company_id</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>edgar/data/1000015/0000912057-00-014793.txt</th>\n",
-       "      <td>NaN</td>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>024 pharma incorporated</td>\n",
+       "      <td>224 datura st</td>\n",
+       "      <td>edgar/data/1307969/0001683168-17-000653.txt</td>\n",
+       "      <td>(732) 696-9333</td>\n",
+       "      <td>0001307969</td>\n",
+       "      <td>west palm beach</td>\n",
+       "      <td>024 pharma, inc.</td>\n",
+       "      <td>20091202</td>\n",
+       "      <td>17711535</td>\n",
+       "      <td>1231</td>\n",
+       "      <td>10-k</td>\n",
+       "      <td>b green innovations, inc.</td>\n",
+       "      <td>201862731</td>\n",
        "      <td>NaN</td>\n",
+       "      <td>1934 act</td>\n",
+       "      <td>333-120490</td>\n",
+       "      <td>plastics products, nec [3089]</td>\n",
+       "      <td>fl</td>\n",
+       "      <td>nj</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>2039736700</td>\n",
-       "      <td>0001000015</td>\n",
-       "      <td>stamford</td>\n",
-       "      <td>meta group inc</td>\n",
+       "      <td>33401</td>\n",
+       "      <td>2017-03-24</td>\n",
+       "      <td>2017</td>\n",
+       "      <td>new jersey</td>\n",
+       "      <td>024 pharma</td>\n",
+       "      <td>FRM</td>\n",
+       "      <td>True</td>\n",
+       "      <td>0001307969</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1 800 contacts incorporated</td>\n",
+       "      <td>13751 s wadsworth park dr suite d140</td>\n",
+       "      <td>edgar/data/1050122/0001104659-06-017311.txt</td>\n",
+       "      <td>8015728225</td>\n",
+       "      <td>0001050122</td>\n",
+       "      <td>draper</td>\n",
+       "      <td>1 800 contacts inc</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>585471</td>\n",
+       "      <td>06691791</td>\n",
        "      <td>1231</td>\n",
        "      <td>10-k</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>060971675</td>\n",
+       "      <td>870571643</td>\n",
        "      <td>NaN</td>\n",
+       "      <td>1934 act</td>\n",
+       "      <td>000-23633</td>\n",
+       "      <td>retail-catalog &amp; mail-order houses [5961]</td>\n",
+       "      <td>ut</td>\n",
+       "      <td>de</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>000-27280</td>\n",
-       "      <td>services-engineering, accounting, research, ma...</td>\n",
-       "      <td>ct</td>\n",
+       "      <td>84020</td>\n",
+       "      <td>2006-03-16</td>\n",
+       "      <td>2006</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>1 800 contacts</td>\n",
+       "      <td>KNTKTS</td>\n",
+       "      <td>True</td>\n",
+       "      <td>0001050122</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2</td>\n",
+       "      <td>1 800 contacts incorporated</td>\n",
+       "      <td>66 e wadsworth park dr</td>\n",
+       "      <td>edgar/data/1050122/0001104659-07-019474.txt</td>\n",
+       "      <td>801-316-5000</td>\n",
+       "      <td>0001050122</td>\n",
+       "      <td>draper</td>\n",
+       "      <td>1 800 contacts inc</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>07696033</td>\n",
+       "      <td>1231</td>\n",
+       "      <td>10-k</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>870571643</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1934 act</td>\n",
+       "      <td>000-23633</td>\n",
+       "      <td>retail-catalog &amp; mail-order houses [5961]</td>\n",
+       "      <td>ut</td>\n",
        "      <td>de</td>\n",
-       "      <td>208 harbor dr</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>06912-0061</td>\n",
+       "      <td>84020</td>\n",
+       "      <td>2007-03-15</td>\n",
+       "      <td>2007</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>1 800 contacts</td>\n",
+       "      <td>KNTKTS</td>\n",
+       "      <td>True</td>\n",
+       "      <td>0001050122</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "                                            ]fiscal_year_end ]irs_number ]state_of_incorporation business_phone central_index_key      city company_conformed_name date_of_name_change film_number fiscal_year_end form_type former_conformed_name irs_number organization_name sec_act sec_file_number                 standard_industrial_classification state state_of_incorporation       street_1 street_2         zip\n",
-       "filename                                                                                                                                                                                                                                                                                                                                                                                                                   \n",
-       "edgar/data/1000015/0000912057-00-014793.txt              NaN         NaN                     NaN     2039736700        0001000015  stamford         meta group inc                 NaN      585471            1231      10-k                   NaN  060971675               NaN     NaN       000-27280  services-engineering, accounting, research, ma...    ct                     de  208 harbor dr      NaN  06912-0061"
+       "   record_id                 company_name                        street_address                                     filename    phone_number central_index_key             city    company_name_raw date_of_name_change film_number fiscal_year_end form_type      former_conformed_name irs_number organization_name   sec_act sec_file_number         standard_industrial_classification state state_of_incorporation street_address_2 zip_code report_date  report_year location_of_inc company_name_no_legal company_name_mphone  files_10k sec_company_id\n",
+       "0          0      024 pharma incorporated                         224 datura st  edgar/data/1307969/0001683168-17-000653.txt  (732) 696-9333        0001307969  west palm beach    024 pharma, inc.            20091202    17711535            1231      10-k  b green innovations, inc.  201862731               NaN  1934 act      333-120490              plastics products, nec [3089]    fl                     nj              NaN    33401  2017-03-24         2017      new jersey            024 pharma                 FRM       True     0001307969\n",
+       "1          1  1 800 contacts incorporated  13751 s wadsworth park dr suite d140  edgar/data/1050122/0001104659-06-017311.txt      8015728225        0001050122           draper  1 800 contacts inc                 NaN    06691791            1231      10-k                        NaN  870571643               NaN  1934 act       000-23633  retail-catalog & mail-order houses [5961]    ut                     de              NaN    84020  2006-03-16         2006        delaware        1 800 contacts              KNTKTS       True     0001050122\n",
+       "2          2  1 800 contacts incorporated                66 e wadsworth park dr  edgar/data/1050122/0001104659-07-019474.txt    801-316-5000        0001050122           draper  1 800 contacts inc                 NaN    07696033            1231      10-k                        NaN  870571643               NaN  1934 act       000-23633  retail-catalog & mail-order houses [5961]    ut                     de              NaN    84020  2007-03-15         2007        delaware        1 800 contacts              KNTKTS       True     0001050122"
       ]
      },
-     "execution_count": 29,
+     "execution_count": 101,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "raw_sec_df.head(1)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "id": "f6f76c8b-ffbf-4e2b-870b-57f1260ba522",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sec_clean_df = prepare_sec10k_basic_info_df(raw_sec_df)"
+    "sec_df.head(3)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
-   "id": "329e5d07-4eb4-4ba2-968e-aabf9be4937b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sec_asset_df = pd.read_parquet(UPath(\"gs://sec10k-outputs/v2/out_sec_10k__parents_and_subsidiaries/2023q1.parquet\"))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "3bac9280-1183-4aba-b78f-84bcf37ef1e2",
+   "execution_count": 102,
+   "id": "63d97f0d-df22-4c27-b3e7-1035166b4011",
    "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "61026"
+      ]
+     },
+     "execution_count": 102,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "### Ex. 21"
+    "len(sec_df)"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "ae57370a-36bb-40cf-b9f1-8ffdf373fa22",
+   "id": "381e4a8f-d8da-4802-b86e-1ac4fc3094db",
    "metadata": {},
    "source": [
-    "TODO: get rid of this section"
+    "# Preprocess SEC and EIA\n",
+    "\n",
+    "Does it make more sense to do a direct match on company name after\n",
+    "the SEC basic info to EIA match is done? And if there's a conflicting SEC match (one basic info and one Ex. 21) then review it manually?"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
-   "id": "611da616-45ef-40ae-bc06-8bfbc871274d",
+   "execution_count": 103,
+   "id": "7d2d103a-2bbd-4974-b770-44626bdc5111",
    "metadata": {},
    "outputs": [],
    "source": [
-    "ex21_path = UPath(\"gs://sec10k-outputs/v2/ex21_company_ownership_info\")"
+    "sec_match_df = sec_df[sec_df.files_10k][SHARED_COLS]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
-   "id": "1d6272f2-b6f3-4497-9251-cbeedf794a0b",
+   "execution_count": 104,
+   "id": "c734137b-fd76-4f6a-a828-23cd12d4ac27",
    "metadata": {},
    "outputs": [],
    "source": [
-    "raw_ex21_df = pd.DataFrame()\n",
-    "for file in ex21_path.iterdir():\n",
-    "    if file.name.split(\".\")[-1] == \"parquet\":\n",
-    "        year_quarter_df = pd.read_parquet(ex21_path / file.name)\n",
-    "        report_year = file.name[:4]\n",
-    "        year_quarter_df.loc[:, \"report_year\"] = report_year\n",
-    "        year_quarter_df.loc[:, \"report_year\"] = pd.to_datetime(year_quarter_df[\"report_year\"], format=\"%Y\").dt.year\n",
-    "        raw_ex21_df = pd.concat([raw_ex21_df, year_quarter_df])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "b636d438-ed71-426c-8c2a-9e550fe99958",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "# Preprocess Ex. 21"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "917c79d4-9250-46a7-855a-14e526bbce6c",
-   "metadata": {},
-   "source": [
-    "TODO: get rid of this section"
+    "eia_match_df = eia_df[SHARED_COLS]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
-   "id": "84e26751-663b-45a5-bb4d-fbfbbdca447e",
+   "execution_count": 105,
+   "id": "e754b2ef-5a0d-4582-8694-047528dfd339",
    "metadata": {},
    "outputs": [
     {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Users/katielamb/CatalystCoop/mozilla-sec-eia/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py:168: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
-      "  df = df.fillna(np.nan)\n"
-     ]
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 105,
+     "metadata": {},
+     "output_type": "execute_result"
     }
    ],
    "source": [
-    "ex21_clean_df = prepare_ex21_df(raw_ex21_df)"
+    "sec_match_df.record_id.is_unique"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
-   "id": "027191c4-82fa-491b-8c73-54551c7fa4e6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sec_match_df = sec_clean_df.drop_duplicates(subset=[\"central_index_key\", \"company_name\", \"loc_of_incorporation\", \"report_year\"])\n",
-    "merged_df = sec_match_df.merge(ex21_clean_df, how=\"inner\", on=\"company_name\", suffixes=(\"_sec\", \"_ex21\"))\n",
-    "merged_df.loc[:, \"loc_tokens_sec\"] = merged_df[\"loc_of_incorporation_sec\"].fillna(\"\").str.lower().str.split()\n",
-    "merged_df.loc[:, \"loc_tokens_ex21\"] = merged_df[\"loc_of_incorporation_ex21\"].fillna(\"\").str.lower().str.split()\n",
-    "merged_df[\"loc_overlap\"] = merged_df.apply(\n",
-    "    lambda row: len(set(row[\"loc_tokens_sec\"]) & set(row[\"loc_tokens_ex21\"])), axis=1\n",
-    ")\n",
-    "merged_df[\"report_year_diff\"] = merged_df.apply(\n",
-    "    lambda row: abs(int(row[\"report_year_sec\"]) - int(row[\"report_year_ex21\"])), axis=1\n",
-    ")\n",
-    "# Sort by CIK, company_name, loc_overlap, and report_year_diff\n",
-    "# so that we can then choose the first record in each CIK, company_name group\n",
-    "merged_df = merged_df.sort_values(by=[\"central_index_key\", \"company_name\", \"loc_overlap\", \"report_year_diff\"],\n",
-    "                                  ascending=[True, True, False, True]\n",
-    "                                 )\n",
-    "# Select the row with the highest loc overlap and nearest report years for each CIK and company name\n",
-    "cik_and_company_pairs = merged_df.groupby([\"central_index_key\", \"company_name\"], as_index=False).first()\n",
-    "# We now have the closest matching CIK and company name pairs\n",
-    "# We want to get the best matching CIK for each company name and loc of incorporation\n",
-    "# Select the row with the highest loc overlap and nearest report years for each company name and loc pair\n",
-    "cik_and_company_pairs = cik_and_company_pairs.sort_values(by=[\"company_name\", \"loc_of_incorporation_ex21\", \"loc_overlap\", \"report_year_diff\"],\n",
-    "                                                          ascending=[True, True, False, True]\n",
-    "                                                         )\n",
-    "closest_match = cik_and_company_pairs.groupby([\"company_name\", \"loc_of_incorporation_ex21\"], as_index=False).first()\n",
-    "closest_match = closest_match.drop_duplicates(subset=[\"central_index_key\", \"company_name\", \"loc_of_incorporation_ex21\"])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 35,
-   "id": "bd9e9f44-7ff8-4615-a5c3-ee8f32439e26",
+   "execution_count": 106,
+   "id": "38ad3504-2cde-455f-8896-6a435677541c",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "False    5808\n",
-       "Name: count, dtype: int64"
+       "True"
       ]
      },
-     "execution_count": 35,
+     "execution_count": 106,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "# a company name and location of incorporation should match to only one CIK\n",
-    "closest_match.duplicated(subset=[\"company_name\", \"loc_of_incorporation_ex21\"]).value_counts()"
+    "eia_match_df.record_id.is_unique"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
-   "id": "64572f77-0a64-48a9-83fd-1c0179202010",
+   "execution_count": 107,
+   "id": "856c14d8-3250-4650-a2db-3808b4718f19",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "central_index_key\n",
-       "False    5532\n",
-       "True      276\n",
-       "Name: count, dtype: int64"
+       "False"
       ]
      },
-     "execution_count": 36,
+     "execution_count": 107,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "# it's okay if there's duplication here\n",
-    "# multiple subsidiaries can point to the same CIK\n",
-    "# and company names can change and they still keep the same CIK\n",
-    "closest_match.central_index_key.duplicated().value_counts()"
+    "# Note that sec_company_id isn't unique here because we are keeping each unique company name and address pair\n",
+    "# later we'll flatten on sec_company_id and utility_id_eia\n",
+    "sec_df.sec_company_id.is_unique"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 37,
-   "id": "a669e0b7-c7fb-4c12-9121-0282e616286a",
+   "cell_type": "markdown",
+   "id": "b18fef7e-c316-4c90-b2bc-04706401135e",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "ex21_with_cik = ex21_clean_df.merge(\n",
-    "    closest_match[[\"company_name\", \"central_index_key\", \"loc_of_incorporation_ex21\"]].rename(columns={\"loc_of_incorporation_ex21\": \"loc_of_incorporation\"}),\n",
-    "    how=\"left\",\n",
-    "    on=[\"company_name\", \"loc_of_incorporation\"],\n",
-    ").rename(columns={\"central_index_key\": \"subsidiary_cik\"})"
+    "There can be duplicate records because sometimes a company changes utility ID or central index key over time. Keep the most recent version of that record."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
-   "id": "245697ec-9451-47e7-953b-eba65062ee93",
+   "execution_count": 108,
+   "id": "842fa02e-5202-445c-b728-72bce42e740d",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "subsidiary_cik\n",
-       "True     2900030\n",
-       "False      21674\n",
+       "False    20821\n",
        "Name: count, dtype: int64"
       ]
      },
-     "execution_count": 38,
+     "execution_count": 108,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "ex21_with_cik.subsidiary_cik.isnull().value_counts()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 39,
-   "id": "1382a2e4-e88e-47bb-93ed-dafc576ec2f4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "ex21_with_cik = ex21_with_cik.merge(closest_match[[\"company_name\", \"central_index_key\"]],\n",
-    "                                    how=\"left\",\n",
-    "                                    on=\"company_name\"\n",
-    "                                   ).rename(columns={\"central_index_key\": \"company_name_merge_cik\"})"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 40,
-   "id": "5f70e3ff-2494-4eda-bfa2-6989bcf442bb",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# if a subsidiary doesn't have a CIK and has a null location\n",
-    "# but its company name was assigned a CIK (with a different location)\n",
-    "# then assign that CIK to the subsidiary\n",
-    "ex21_with_cik[\"subsidiary_cik\"] = ex21_with_cik[\"subsidiary_cik\"].where(\n",
-    "    ~(ex21_with_cik.subsidiary_cik.isnull()) | ~(ex21_with_cik.loc_of_incorporation.isnull()), \n",
-    "    ex21_with_cik[\"company_name_merge_cik\"]\n",
-    ")"
+    "eia_match_df.duplicated(subset=MATCH_COLS).value_counts()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
-   "id": "63d4cc13-a4bf-4473-99bb-6d8fcf9a1174",
+   "execution_count": 109,
+   "id": "b53e6244-f0ca-4256-bc09-9c3264675389",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "subsidiary_cik\n",
-       "True     2897527\n",
-       "False      24221\n",
+       "False    61026\n",
        "Name: count, dtype: int64"
       ]
      },
-     "execution_count": 41,
+     "execution_count": 109,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "# there should be fewer null CIKs now\n",
-    "ex21_with_cik.subsidiary_cik.isnull().value_counts()"
+    "sec_match_df.duplicated(subset=MATCH_COLS).value_counts()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
-   "id": "e25cf09f-8bbd-4dcd-b308-71bc5a357bf5",
+   "execution_count": 253,
+   "id": "e4d54448-0c2f-452b-931c-ff79a5cc3669",
    "metadata": {},
    "outputs": [],
    "source": [
-    "archive = GCSArchive()\n",
-    "md = archive.get_metadata()"
+    "sec_match_df = sec_match_df.sort_values(by=\"report_year\", ascending=False).drop_duplicates(subset=MATCH_COLS, keep=\"first\")\n",
+    "eia_match_df = eia_match_df.sort_values(by=\"report_year\", ascending=False).drop_duplicates(subset=MATCH_COLS, keep=\"first\")"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 43,
-   "id": "d17ed466-74d6-44e5-aaca-8dc6793712d4",
+   "cell_type": "markdown",
+   "id": "46d967d4-3722-437d-b2f0-37cbac17624f",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "ex21_with_cik.loc[:, \"filename\"] = convert_ex21_id_to_filename(ex21_with_cik)"
+    "# Link SEC and EIA"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 44,
-   "id": "6303051b-74bf-4043-885e-aaaf6593852d",
+   "cell_type": "markdown",
+   "id": "509988b1-ed2c-41b3-9334-f44ae599cf4f",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "ex21_with_cik = ex21_with_cik.merge(md[\"cik\"],\n",
-    "                                    how=\"left\",\n",
-    "                                    left_on=\"filename\",\n",
-    "                                    right_index=True).rename(columns={\"cik\": \"parent_cik\"})"
+    "## Exploratory Analysis"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 45,
-   "id": "da72f2d4-54a8-487a-82ec-92d9e8df091f",
+   "execution_count": 112,
+   "id": "de894ad0-0b72-4486-b737-c3bfb95f8f05",
    "metadata": {},
    "outputs": [],
    "source": [
-    "ex21_with_cik = add_sec_company_id_to_subsidiaries(ex21_with_cik)"
+    "db_api = DuckDBAPI()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 46,
-   "id": "eff49691-d17c-4a55-817d-8eeaf83900e4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# remove the Ex. 21 subsidiaries who were matched to a filing company\n",
-    "unmatched_ex21_df = ex21_with_cik[ex21_with_cik.subsidiary_cik.isnull()]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "381e4a8f-d8da-4802-b86e-1ac4fc3094db",
-   "metadata": {},
-   "source": [
-    "# Preprocess SEC and EIA\n",
-    "\n",
-    "Does it actually make sense to add in the Ex. 21 subsidiaries when we only have company name?\n",
-    "Does it make more sense to do a direct match on company name after\n",
-    "the SEC basic info to EIA match is done? And if there's a conflicting SEC match (one basic info and one Ex. 21) then review it manually?"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "dd3b1335-6ffc-4c8d-b45e-5bee9f3f48da",
-   "metadata": {},
-   "source": [
-    "TODO: get rid of these cells"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "aaf6c9f9-6fe6-4259-bbc4-d8a18e55984c",
-   "metadata": {},
-   "source": [
-    "TODO: filter for only \"files_10k\" filers"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 67,
-   "id": "8453d55d-a3ac-422d-9cef-e7f13d582efe",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# find a way to use state of incorporation even though it's not on the EIA side?\n",
-    "sec_full_clean_df = pd.concat([sec_clean_df, \n",
-    "                               unmatched_ex21_df[[\"sec_company_id\", \"report_year\", \"company_name\", \"company_name_no_legal\", \"company_name_mphone\", \"state_of_incorporation\"]]\n",
-    "                              ])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 71,
-   "id": "2bc79d7d-b756-47d5-a61d-a3a761160250",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sec_full_clean_df = sec_full_clean_df.reset_index(drop=True).reset_index(names=\"record_id\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "id": "166d3c96-93d6-4a22-afbf-8d94dc9ecfb9",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# for now, just use sec_clean_df without Ex. 21 subsidiaries\n",
-    "sec_clean_df = sec_clean_df.reset_index(drop=True).reset_index(names=\"record_id\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "id": "24defbd5-ccfe-4844-ab87-3adb1b4df2d9",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "eia_clean_df = prepare_eia_df(eia_df)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 27,
-   "id": "e754b2ef-5a0d-4582-8694-047528dfd339",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "True"
-      ]
-     },
-     "execution_count": 27,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "sec_clean_df.record_id.is_unique"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 28,
-   "id": "38ad3504-2cde-455f-8896-6a435677541c",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "True"
-      ]
-     },
-     "execution_count": 28,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "eia_clean_df.record_id.is_unique"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 30,
-   "id": "e90de0d3-3220-4869-80a3-fc7dd381d393",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# TODO: move this into preprocessing\n",
-    "# strip legal terms and then make a list column from company name\n",
-    "# use this for blocking and comnparison levels\n",
-    "eia_clean_df.loc[:, \"company_name_mphone_list\"] = eia_clean_df[\"company_name_mphone\"].str.split()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 31,
-   "id": "b71a24f2-51b5-444f-a645-054cc3e25cf8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sec_clean_df.loc[:, \"company_name_mphone_list\"] = sec_clean_df[\"company_name_mphone\"].str.split()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 32,
-   "id": "eb9c00dc-50a5-49cc-9589-0bf4df917ab3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "eia_clean_df.loc[:, \"zip_code\"] = eia_clean_df[\"zip_code\"].str[:5]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 33,
-   "id": "edead864-7004-4081-ab78-313c14ff81a3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sec_clean_df.loc[:, \"zip_code\"] = sec_clean_df[\"zip_code\"].str[:5]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 34,
-   "id": "a5af13b2-9d43-42e6-9477-1fb7d52412cf",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# I think we don't need this column\n",
-    "eia_clean_df.loc[:, \"street_address_list\"] = eia_clean_df[\"street_address\"].str.split()\n",
-    "sec_clean_df.loc[:, \"street_address_list\"] = sec_clean_df[\"street_address\"].str.split()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "9f7bebc3-8e79-48e9-9178-68c112bb8ee9",
-   "metadata": {},
-   "source": [
-    "TODO: import from config file"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 36,
-   "id": "a284b2c9-8edf-4b3f-ab08-5b2cff65ed19",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "SHARED_COLS = [\n",
-    "    \"record_id\",\n",
-    "    \"report_date\",\n",
-    "    \"report_year\",\n",
-    "    \"company_name\",\n",
-    "    \"company_name_no_legal\",\n",
-    "    \"street_address\",\n",
-    "    \"street_address_list\",\n",
-    "    \"street_address_2\",\n",
-    "    \"city\",\n",
-    "    \"state\",  # could use state of incorporation from SEC\n",
-    "    \"zip_code\",\n",
-    "    \"phone_number\",\n",
-    "    \"company_name_mphone\",\n",
-    "    \"company_name_mphone_list\"\n",
-    "]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 55,
-   "id": "c734137b-fd76-4f6a-a828-23cd12d4ac27",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "eia_match_df = eia_clean_df[SHARED_COLS]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 38,
-   "id": "2b8b6313-abf0-4233-8bad-43b8b9cc1e0b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sec_match_df = sec_clean_df[SHARED_COLS]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "13bda908-2007-4bca-86ad-1bcf74b1b1ef",
-   "metadata": {},
-   "source": [
-    "TODO: import from config"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 43,
-   "id": "a4a15b86-71cf-4d8d-9c09-f82a70f10273",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "match_cols = [\"company_name\", \"state\", \"city\", \"street_address\", \"zip_code\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 49,
-   "id": "842fa02e-5202-445c-b728-72bce42e740d",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "True     138441\n",
-       "False     39407\n",
-       "Name: count, dtype: int64"
-      ]
-     },
-     "execution_count": 49,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# duplicates exist because of differing report years\n",
-    "eia_match_df.duplicated(subset=match_cols).value_counts()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 52,
-   "id": "b53e6244-f0ca-4256-bc09-9c3264675389",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "True     168445\n",
-       "False     64515\n",
-       "Name: count, dtype: int64"
-      ]
-     },
-     "execution_count": 52,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "sec_match_df.duplicated(subset=match_cols).value_counts()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 56,
-   "id": "baa742ae-1b49-4d0a-84c8-5f864398c8ed",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sec_match_df = sec_match_df.sort_values(by=\"report_year\", ascending=False).drop_duplicates(subset=match_cols, keep=\"first\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 57,
-   "id": "63e47f5f-e142-48fa-9ffa-e14d27ee1476",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "eia_match_df = eia_match_df.sort_values(by=\"report_year\", ascending=False).drop_duplicates(subset=match_cols, keep=\"first\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 39,
-   "id": "5cf7ca17-b42b-40c6-b6f7-9077acdb1220",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "standard_industrial_classification\n",
-       "asset-backed securities [6189]          20311\n",
-       "pharmaceutical preparations [2834]       8530\n",
-       "state commercial banks [6022]            7886\n",
-       "real estate investment trusts [6798]     7706\n",
-       "services-prepackaged software [7372]     6007\n",
-       "Name: count, dtype: int64"
-      ]
-     },
-     "execution_count": 39,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# could try to use keywords like gas, electricity, utility etc.\n",
-    "sec_clean_df[\"standard_industrial_classification\"].value_counts().head(5)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "46d967d4-3722-437d-b2f0-37cbac17624f",
-   "metadata": {},
-   "source": [
-    "# Link SEC and EIA"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "509988b1-ed2c-41b3-9334-f44ae599cf4f",
-   "metadata": {},
-   "source": [
-    "## Exploratory Analysis"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 128,
-   "id": "de894ad0-0b72-4486-b737-c3bfb95f8f05",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "db_api = DuckDBAPI()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 129,
-   "id": "ac4e560b-6946-4cc7-b2bc-6d5f4b154da6",
+   "execution_count": 113,
+   "id": "4bab1568-6a55-427c-9a78-e44db8b0584d",
    "metadata": {},
    "outputs": [
     {
@@ -1060,23 +687,23 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-568ae8a9a7b0476a9476900de3419267.vega-embed {\n",
+       "  #altair-viz-bffae9d64118401bb4629bbba335e3e7.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-568ae8a9a7b0476a9476900de3419267.vega-embed details,\n",
-       "  #altair-viz-568ae8a9a7b0476a9476900de3419267.vega-embed details summary {\n",
+       "  #altair-viz-bffae9d64118401bb4629bbba335e3e7.vega-embed details,\n",
+       "  #altair-viz-bffae9d64118401bb4629bbba335e3e7.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-568ae8a9a7b0476a9476900de3419267\"></div>\n",
+       "<div id=\"altair-viz-bffae9d64118401bb4629bbba335e3e7\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-568ae8a9a7b0476a9476900de3419267\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-568ae8a9a7b0476a9476900de3419267\");\n",
+       "    if (outputDiv.id !== \"altair-viz-bffae9d64118401bb4629bbba335e3e7\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-bffae9d64118401bb4629bbba335e3e7\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -1122,27 +749,26 @@
        "        .catch(showError)\n",
        "        .then(() => displayChart(vegaEmbed));\n",
        "    }\n",
-       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"layer\": [{\"mark\": \"rect\", \"encoding\": {\"color\": {\"field\": \"completeness\", \"legend\": null, \"scale\": {\"scheme\": \"darkred\", \"zero\": true}, \"type\": \"quantitative\"}, \"tooltip\": [{\"field\": \"source_dataset\", \"title\": \"Source dataset\", \"type\": \"nominal\"}, {\"field\": \"total_rows_inc_nulls\", \"format\": \",\", \"title\": \"# of records\", \"type\": \"quantitative\"}, {\"field\": \"column_name\", \"title\": \"Column name\", \"type\": \"nominal\"}, {\"field\": \"total_null_rows\", \"format\": \",\", \"title\": \"# of nulls\", \"type\": \"quantitative\"}, {\"field\": \"completeness\", \"format\": \".1%\", \"type\": \"quantitative\"}], \"x\": {\"axis\": {\"labelAngle\": 20}, \"field\": \"column_name\", \"sort\": {\"field\": \"mean_comp\", \"order\": \"descending\"}, \"title\": \"Column name\", \"type\": \"nominal\"}, \"y\": {\"field\": \"source_dataset\", \"title\": \"Source dataset\", \"type\": \"nominal\"}}, \"title\": \"Column completeness by source dataset\", \"transform\": [{\"joinaggregate\": [{\"op\": \"mean\", \"field\": \"completeness\", \"as\": \"mean_comp\"}], \"groupby\": [\"column_name\"]}]}, {\"mark\": {\"type\": \"text\"}, \"encoding\": {\"color\": {\"condition\": {\"test\": \"datum['completeness'] < 0.5\", \"value\": \"white\"}, \"value\": \"black\"}, \"text\": {\"field\": \"completeness\", \"format\": \".0%\", \"type\": \"quantitative\"}, \"x\": {\"axis\": {\"labelAngle\": 0}, \"field\": \"column_name\", \"sort\": {\"field\": \"mean_comp\", \"order\": \"descending\"}, \"type\": \"nominal\"}, \"y\": {\"field\": \"source_dataset\", \"type\": \"nominal\"}}, \"transform\": [{\"joinaggregate\": [{\"op\": \"mean\", \"field\": \"completeness\", \"as\": \"mean_comp\"}], \"groupby\": [\"column_name\"]}]}], \"data\": {\"name\": \"data-a6b030dc7069d2f4600013c4a9b5bad7\"}, \"height\": {\"step\": 40}, \"width\": {\"step\": 40}, 
\"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-a6b030dc7069d2f4600013c4a9b5bad7\": [{\"source_dataset\": \"input_data_1\", \"column_name\": \"record_id\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 64515, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"report_date\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 64515, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"report_year\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 64515, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name\", \"total_null_rows\": 2, \"total_rows_inc_nulls\": 64515, \"completeness\": 0.9999690055847168}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name_no_legal\", \"total_null_rows\": 2, \"total_rows_inc_nulls\": 64515, \"completeness\": 0.9999690055847168}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"street_address\", \"total_null_rows\": 96, \"total_rows_inc_nulls\": 64515, \"completeness\": 0.9985119700431824}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"street_address_list\", \"total_null_rows\": 96, \"total_rows_inc_nulls\": 64515, \"completeness\": 0.9985119700431824}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"street_address_2\", \"total_null_rows\": 34486, \"total_rows_inc_nulls\": 64515, \"completeness\": 0.46545764803886414}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"city\", \"total_null_rows\": 69, \"total_rows_inc_nulls\": 64515, \"completeness\": 0.9989304542541504}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"state\", \"total_null_rows\": 109, \"total_rows_inc_nulls\": 64515, \"completeness\": 0.9983104467391968}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"zip_code\", \"total_null_rows\": 274, \"total_rows_inc_nulls\": 64515, \"completeness\": 0.9957529306411743}, {\"source_dataset\": 
\"input_data_1\", \"column_name\": \"phone_number\", \"total_null_rows\": 2914, \"total_rows_inc_nulls\": 64515, \"completeness\": 0.9548321962356567}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name_mphone\", \"total_null_rows\": 2, \"total_rows_inc_nulls\": 64515, \"completeness\": 0.9999690055847168}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name_mphone_list\", \"total_null_rows\": 2, \"total_rows_inc_nulls\": 64515, \"completeness\": 0.9999690055847168}]}}, {\"mode\": \"vega-lite\"});\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"layer\": [{\"mark\": \"rect\", \"encoding\": {\"color\": {\"field\": \"completeness\", \"legend\": null, \"scale\": {\"scheme\": \"darkred\", \"zero\": true}, \"type\": \"quantitative\"}, \"tooltip\": [{\"field\": \"source_dataset\", \"title\": \"Source dataset\", \"type\": \"nominal\"}, {\"field\": \"total_rows_inc_nulls\", \"format\": \",\", \"title\": \"# of records\", \"type\": \"quantitative\"}, {\"field\": \"column_name\", \"title\": \"Column name\", \"type\": \"nominal\"}, {\"field\": \"total_null_rows\", \"format\": \",\", \"title\": \"# of nulls\", \"type\": \"quantitative\"}, {\"field\": \"completeness\", \"format\": \".1%\", \"type\": \"quantitative\"}], \"x\": {\"axis\": {\"labelAngle\": 20}, \"field\": \"column_name\", \"sort\": {\"field\": \"mean_comp\", \"order\": \"descending\"}, \"title\": \"Column name\", \"type\": \"nominal\"}, \"y\": {\"field\": \"source_dataset\", \"title\": \"Source dataset\", \"type\": \"nominal\"}}, \"title\": \"Column completeness by source dataset\", \"transform\": [{\"joinaggregate\": [{\"op\": \"mean\", \"field\": \"completeness\", \"as\": \"mean_comp\"}], \"groupby\": [\"column_name\"]}]}, {\"mark\": {\"type\": \"text\"}, \"encoding\": {\"color\": {\"condition\": {\"test\": \"datum['completeness'] < 0.5\", \"value\": \"white\"}, \"value\": \"black\"}, \"text\": {\"field\": \"completeness\", \"format\": \".0%\", \"type\": \"quantitative\"}, \"x\": {\"axis\": {\"labelAngle\": 0}, \"field\": \"column_name\", \"sort\": {\"field\": \"mean_comp\", \"order\": \"descending\"}, \"type\": \"nominal\"}, \"y\": {\"field\": \"source_dataset\", \"type\": \"nominal\"}}, \"transform\": [{\"joinaggregate\": [{\"op\": \"mean\", \"field\": \"completeness\", \"as\": \"mean_comp\"}], \"groupby\": [\"column_name\"]}]}], \"data\": {\"name\": \"data-54c987419c436520c4be38df6c3144f0\"}, \"height\": {\"step\": 40}, \"width\": {\"step\": 40}, 
\"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-54c987419c436520c4be38df6c3144f0\": [{\"source_dataset\": \"input_data_1\", \"column_name\": \"record_id\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 61026, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"report_date\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 61026, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"report_year\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 61026, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 61026, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name_no_legal\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 61026, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name_mphone\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 61026, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"street_address\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 61026, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"street_address_2\", \"total_null_rows\": 32198, \"total_rows_inc_nulls\": 61026, \"completeness\": 0.4723888039588928}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"city\", \"total_null_rows\": 14, \"total_rows_inc_nulls\": 61026, \"completeness\": 0.9997705817222595}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"state\", \"total_null_rows\": 38, \"total_rows_inc_nulls\": 61026, \"completeness\": 0.9993773102760315}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"zip_code\", \"total_null_rows\": 168, \"total_rows_inc_nulls\": 61026, \"completeness\": 0.9972470998764038}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"phone_number\", \"total_null_rows\": 
2821, \"total_rows_inc_nulls\": 61026, \"completeness\": 0.9537737965583801}]}}, {\"mode\": \"vega-lite\"});\n",
        "</script>"
       ],
       "text/plain": [
        "alt.LayerChart(...)"
       ]
      },
-     "execution_count": 129,
+     "execution_count": 113,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "# this goes way down when we start matching in the Ex. 21 subsidiaries\n",
     "completeness_chart(sec_match_df, db_api=db_api)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 130,
-   "id": "02063bcd-8301-4a70-aab1-0bbf6119cf8b",
+   "execution_count": 114,
+   "id": "6b9479e3-e836-4407-a2b6-926c185065a8",
    "metadata": {},
    "outputs": [
     {
@@ -1150,23 +776,23 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-278a5be917034b29a93d18bbbb0a987c.vega-embed {\n",
+       "  #altair-viz-f131dd48afce49899469d187e41fd69b.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-278a5be917034b29a93d18bbbb0a987c.vega-embed details,\n",
-       "  #altair-viz-278a5be917034b29a93d18bbbb0a987c.vega-embed details summary {\n",
+       "  #altair-viz-f131dd48afce49899469d187e41fd69b.vega-embed details,\n",
+       "  #altair-viz-f131dd48afce49899469d187e41fd69b.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-278a5be917034b29a93d18bbbb0a987c\"></div>\n",
+       "<div id=\"altair-viz-f131dd48afce49899469d187e41fd69b\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-278a5be917034b29a93d18bbbb0a987c\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-278a5be917034b29a93d18bbbb0a987c\");\n",
+       "    if (outputDiv.id !== \"altair-viz-f131dd48afce49899469d187e41fd69b\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-f131dd48afce49899469d187e41fd69b\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -1212,14 +838,14 @@
        "        .catch(showError)\n",
        "        .then(() => displayChart(vegaEmbed));\n",
        "    }\n",
-       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"layer\": [{\"mark\": \"rect\", \"encoding\": {\"color\": {\"field\": \"completeness\", \"legend\": null, \"scale\": {\"scheme\": \"darkred\", \"zero\": true}, \"type\": \"quantitative\"}, \"tooltip\": [{\"field\": \"source_dataset\", \"title\": \"Source dataset\", \"type\": \"nominal\"}, {\"field\": \"total_rows_inc_nulls\", \"format\": \",\", \"title\": \"# of records\", \"type\": \"quantitative\"}, {\"field\": \"column_name\", \"title\": \"Column name\", \"type\": \"nominal\"}, {\"field\": \"total_null_rows\", \"format\": \",\", \"title\": \"# of nulls\", \"type\": \"quantitative\"}, {\"field\": \"completeness\", \"format\": \".1%\", \"type\": \"quantitative\"}], \"x\": {\"axis\": {\"labelAngle\": 20}, \"field\": \"column_name\", \"sort\": {\"field\": \"mean_comp\", \"order\": \"descending\"}, \"title\": \"Column name\", \"type\": \"nominal\"}, \"y\": {\"field\": \"source_dataset\", \"title\": \"Source dataset\", \"type\": \"nominal\"}}, \"title\": \"Column completeness by source dataset\", \"transform\": [{\"joinaggregate\": [{\"op\": \"mean\", \"field\": \"completeness\", \"as\": \"mean_comp\"}], \"groupby\": [\"column_name\"]}]}, {\"mark\": {\"type\": \"text\"}, \"encoding\": {\"color\": {\"condition\": {\"test\": \"datum['completeness'] < 0.5\", \"value\": \"white\"}, \"value\": \"black\"}, \"text\": {\"field\": \"completeness\", \"format\": \".0%\", \"type\": \"quantitative\"}, \"x\": {\"axis\": {\"labelAngle\": 0}, \"field\": \"column_name\", \"sort\": {\"field\": \"mean_comp\", \"order\": \"descending\"}, \"type\": \"nominal\"}, \"y\": {\"field\": \"source_dataset\", \"type\": \"nominal\"}}, \"transform\": [{\"joinaggregate\": [{\"op\": \"mean\", \"field\": \"completeness\", \"as\": \"mean_comp\"}], \"groupby\": [\"column_name\"]}]}], \"data\": {\"name\": \"data-d98152b9bd4690e94d5eb2c5ee1c5ff9\"}, \"height\": {\"step\": 40}, \"width\": {\"step\": 40}, 
\"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-d98152b9bd4690e94d5eb2c5ee1c5ff9\": [{\"source_dataset\": \"input_data_1\", \"column_name\": \"record_id\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 39407, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"report_date\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 39407, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"report_year\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 39407, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 39407, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name_no_legal\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 39407, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"street_address\", \"total_null_rows\": 19556, \"total_rows_inc_nulls\": 39407, \"completeness\": 0.5037429928779602}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"street_address_list\", \"total_null_rows\": 19556, \"total_rows_inc_nulls\": 39407, \"completeness\": 0.5037429928779602}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"street_address_2\", \"total_null_rows\": 33097, \"total_rows_inc_nulls\": 39407, \"completeness\": 0.16012383997440338}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"city\", \"total_null_rows\": 14129, \"total_rows_inc_nulls\": 39407, \"completeness\": 0.6414596438407898}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"state\", \"total_null_rows\": 9299, \"total_rows_inc_nulls\": 39407, \"completeness\": 0.7640267014503479}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"zip_code\", \"total_null_rows\": 14454, \"total_rows_inc_nulls\": 39407, \"completeness\": 0.6332123875617981}, {\"source_dataset\": \"input_data_1\", 
\"column_name\": \"phone_number\", \"total_null_rows\": 39039, \"total_rows_inc_nulls\": 39407, \"completeness\": 0.009338442236185074}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name_mphone\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 39407, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name_mphone_list\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 39407, \"completeness\": 1.0}]}}, {\"mode\": \"vega-lite\"});\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"layer\": [{\"mark\": \"rect\", \"encoding\": {\"color\": {\"field\": \"completeness\", \"legend\": null, \"scale\": {\"scheme\": \"darkred\", \"zero\": true}, \"type\": \"quantitative\"}, \"tooltip\": [{\"field\": \"source_dataset\", \"title\": \"Source dataset\", \"type\": \"nominal\"}, {\"field\": \"total_rows_inc_nulls\", \"format\": \",\", \"title\": \"# of records\", \"type\": \"quantitative\"}, {\"field\": \"column_name\", \"title\": \"Column name\", \"type\": \"nominal\"}, {\"field\": \"total_null_rows\", \"format\": \",\", \"title\": \"# of nulls\", \"type\": \"quantitative\"}, {\"field\": \"completeness\", \"format\": \".1%\", \"type\": \"quantitative\"}], \"x\": {\"axis\": {\"labelAngle\": 20}, \"field\": \"column_name\", \"sort\": {\"field\": \"mean_comp\", \"order\": \"descending\"}, \"title\": \"Column name\", \"type\": \"nominal\"}, \"y\": {\"field\": \"source_dataset\", \"title\": \"Source dataset\", \"type\": \"nominal\"}}, \"title\": \"Column completeness by source dataset\", \"transform\": [{\"joinaggregate\": [{\"op\": \"mean\", \"field\": \"completeness\", \"as\": \"mean_comp\"}], \"groupby\": [\"column_name\"]}]}, {\"mark\": {\"type\": \"text\"}, \"encoding\": {\"color\": {\"condition\": {\"test\": \"datum['completeness'] < 0.5\", \"value\": \"white\"}, \"value\": \"black\"}, \"text\": {\"field\": \"completeness\", \"format\": \".0%\", \"type\": \"quantitative\"}, \"x\": {\"axis\": {\"labelAngle\": 0}, \"field\": \"column_name\", \"sort\": {\"field\": \"mean_comp\", \"order\": \"descending\"}, \"type\": \"nominal\"}, \"y\": {\"field\": \"source_dataset\", \"type\": \"nominal\"}}, \"transform\": [{\"joinaggregate\": [{\"op\": \"mean\", \"field\": \"completeness\", \"as\": \"mean_comp\"}], \"groupby\": [\"column_name\"]}]}], \"data\": {\"name\": \"data-57edf6da2f9dcdae191503af3bcdc772\"}, \"height\": {\"step\": 40}, \"width\": {\"step\": 40}, 
\"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-57edf6da2f9dcdae191503af3bcdc772\": [{\"source_dataset\": \"input_data_1\", \"column_name\": \"record_id\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 20821, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"report_date\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 20821, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"report_year\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 20821, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 20821, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name_no_legal\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 20821, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"company_name_mphone\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 20821, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"street_address\", \"total_null_rows\": 0, \"total_rows_inc_nulls\": 20821, \"completeness\": 1.0}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"street_address_2\", \"total_null_rows\": 13234, \"total_rows_inc_nulls\": 20821, \"completeness\": 0.3643917143344879}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"city\", \"total_null_rows\": 55, \"total_rows_inc_nulls\": 20821, \"completeness\": 0.9973584413528442}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"state\", \"total_null_rows\": 472, \"total_rows_inc_nulls\": 20821, \"completeness\": 0.9773305654525757}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"zip_code\", \"total_null_rows\": 257, \"total_rows_inc_nulls\": 20821, \"completeness\": 0.9876567125320435}, {\"source_dataset\": \"input_data_1\", \"column_name\": \"phone_number\", \"total_null_rows\": 
16777, \"total_rows_inc_nulls\": 20821, \"completeness\": 0.19422698020935059}]}}, {\"mode\": \"vega-lite\"});\n",
        "</script>"
       ],
       "text/plain": [
        "alt.LayerChart(...)"
       ]
      },
-     "execution_count": 130,
+     "execution_count": 114,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1230,7 +856,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 131,
+   "execution_count": 115,
    "id": "95bd4db7-c447-4a0e-ab37-59d4beac0b11",
    "metadata": {},
    "outputs": [
@@ -1239,23 +865,23 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-122f838f22854f7fb81a5c63e6aba8bf.vega-embed {\n",
+       "  #altair-viz-f82e60d2f54945b9a271aed10e3561ac.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-122f838f22854f7fb81a5c63e6aba8bf.vega-embed details,\n",
-       "  #altair-viz-122f838f22854f7fb81a5c63e6aba8bf.vega-embed details summary {\n",
+       "  #altair-viz-f82e60d2f54945b9a271aed10e3561ac.vega-embed details,\n",
+       "  #altair-viz-f82e60d2f54945b9a271aed10e3561ac.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-122f838f22854f7fb81a5c63e6aba8bf\"></div>\n",
+       "<div id=\"altair-viz-f82e60d2f54945b9a271aed10e3561ac\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-122f838f22854f7fb81a5c63e6aba8bf\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-122f838f22854f7fb81a5c63e6aba8bf\");\n",
+       "    if (outputDiv.id !== \"altair-viz-f82e60d2f54945b9a271aed10e3561ac\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-f82e60d2f54945b9a271aed10e3561ac\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -1301,25 +927,25 @@
        "        .catch(showError)\n",
        "        .then(() => displayChart(vegaEmbed));\n",
        "    }\n",
-       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"vconcat\": [{\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9998449683189392, \"percentile_inc_nulls\": 0.9998449683189392, \"value_count\": 10, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 10.0, \"distinct_value_count\": 46959}, {\"percentile_ex_nulls\": 0.9993489980697632, \"percentile_inc_nulls\": 0.9993489980697632, \"value_count\": 8, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 32.0, \"distinct_value_count\": 46959}, {\"percentile_ex_nulls\": 0.9980469346046448, \"percentile_inc_nulls\": 0.9980469942092896, \"value_count\": 7, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 84.0, \"distinct_value_count\": 46959}, {\"percentile_ex_nulls\": 0.9937686920166016, \"percentile_inc_nulls\": 0.9937688708305359, \"value_count\": 6, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 276.0, \"distinct_value_count\": 46959}, {\"percentile_ex_nulls\": 0.9755553007125854, \"percentile_inc_nulls\": 0.9755560755729675, \"value_count\": 5, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1175.0, \"distinct_value_count\": 46959}, {\"percentile_ex_nulls\": 0.9293010830879211, \"percentile_inc_nulls\": 0.9293032884597778, \"value_count\": 4, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 2984.0, \"distinct_value_count\": 46959}, {\"percentile_ex_nulls\": 
0.806442141532898, \"percentile_inc_nulls\": 0.8064481019973755, \"value_count\": 3, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 7926.0, \"distinct_value_count\": 46959}, {\"percentile_ex_nulls\": 0.535085916519165, \"percentile_inc_nulls\": 0.5351003408432007, \"value_count\": 2, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 17506.0, \"distinct_value_count\": 46959}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 3.0994415283203125e-05, \"value_count\": 1, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 34520.0, \"distinct_value_count\": 46959}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 10, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 10.0, \"distinct_value_count\": 46959}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"company_name\\\"\", \"subtitle\": \"In this col, 2 values (0.0%) are null and there are 46959 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 10, \"group_name\": \"_company_name_\", \"value\": \"comprehensive care 
corporation\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 46959}, {\"value_count\": 8, \"group_name\": \"_company_name_\", \"value\": \"acacia research corporation\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 46959}, {\"value_count\": 8, \"group_name\": \"_company_name_\", \"value\": \"brandywine realty trust\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 46959}, {\"value_count\": 8, \"group_name\": \"_company_name_\", \"value\": \"empire petroleum corporation\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 46959}, {\"value_count\": 8, \"group_name\": \"_company_name_\", \"value\": \"la jolla pharmaceutical company\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 46959}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"omega healthcare investors incorporated\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 46959}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"citigroup incorporated\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 46959}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"central european media enterprises limited\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 46959}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"softech incorporated\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 46959}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"dycom industries incorporated\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 46959}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": 
\"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"evolent health incorporated\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 46959}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"united guardian incorporated\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 46959}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"cue health incorporated\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 46959}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"gs mortgage securities trust 2020 gsa2\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 46959}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"accolade incorporated\", \"total_non_null_rows\": 64513, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 46959}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 10]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by 
value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.8313045501708984, \"percentile_inc_nulls\": 0.8315895795822144, \"value_count\": 10865, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 10865.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.6936154961585999, \"percentile_inc_nulls\": 0.6941331624984741, \"value_count\": 8868, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 8868.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.6153929829597473, \"percentile_inc_nulls\": 0.6160427927970886, \"value_count\": 5038, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 5038.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.5645126104354858, \"percentile_inc_nulls\": 0.5652483701705933, \"value_count\": 3277, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 3277.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.525261640548706, \"percentile_inc_nulls\": 0.5260636806488037, \"value_count\": 2528, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 2528.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.4904046058654785, \"percentile_inc_nulls\": 0.49126559495925903, \"value_count\": 2245, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 2245.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.4588702917098999, \"percentile_inc_nulls\": 0.4597845673561096, \"value_count\": 2031, \"group_name\": \"_state_\", 
\"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 2031.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.4301152229309082, \"percentile_inc_nulls\": 0.4310780167579651, \"value_count\": 1852, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1852.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.40471386909484863, \"percentile_inc_nulls\": 0.4057195782661438, \"value_count\": 1636, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1636.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.379716157913208, \"percentile_inc_nulls\": 0.3807641863822937, \"value_count\": 1610, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1610.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.35474956035614014, \"percentile_inc_nulls\": 0.35583972930908203, \"value_count\": 1608, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1608.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.3316926956176758, \"percentile_inc_nulls\": 0.3328218460083008, \"value_count\": 1485, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1485.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.30921030044555664, \"percentile_inc_nulls\": 0.31037741899490356, \"value_count\": 1448, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1448.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.2887929677963257, \"percentile_inc_nulls\": 0.28999459743499756, \"value_count\": 1315, \"group_name\": 
\"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1315.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.26895010471343994, \"percentile_inc_nulls\": 0.2701852321624756, \"value_count\": 1278, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1278.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.25052011013031006, \"percentile_inc_nulls\": 0.25178641080856323, \"value_count\": 1187, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1187.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.23381364345550537, \"percentile_inc_nulls\": 0.2351081371307373, \"value_count\": 1076, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1076.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.21730893850326538, \"percentile_inc_nulls\": 0.2186313271522522, \"value_count\": 1063, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1063.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.20310217142105103, \"percentile_inc_nulls\": 0.20444858074188232, \"value_count\": 915, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 915.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.19002890586853027, \"percentile_inc_nulls\": 0.19139736890792847, \"value_count\": 842, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 842.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.17740583419799805, \"percentile_inc_nulls\": 0.17879563570022583, \"value_count\": 813, 
\"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 813.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.16577649116516113, \"percentile_inc_nulls\": 0.16718590259552002, \"value_count\": 749, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 749.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.15436452627182007, \"percentile_inc_nulls\": 0.15579324960708618, \"value_count\": 735, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 735.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.14396172761917114, \"percentile_inc_nulls\": 0.145408034324646, \"value_count\": 670, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 670.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.134443998336792, \"percentile_inc_nulls\": 0.1359063982963562, \"value_count\": 613, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 613.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.12548518180847168, \"percentile_inc_nulls\": 0.12696272134780884, \"value_count\": 577, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 577.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.11661958694458008, \"percentile_inc_nulls\": 0.11811208724975586, \"value_count\": 571, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 571.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.10901159048080444, \"percentile_inc_nulls\": 0.11051690578460693, \"value_count\": 490, 
\"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 490.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.1015588641166687, \"percentile_inc_nulls\": 0.10307681560516357, \"value_count\": 480, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 480.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.09496009349822998, \"percentile_inc_nulls\": 0.09648919105529785, \"value_count\": 425, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 425.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.08944821357727051, \"percentile_inc_nulls\": 0.09098660945892334, \"value_count\": 355, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 355.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.08441758155822754, \"percentile_inc_nulls\": 0.08596450090408325, \"value_count\": 324, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 324.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0800391435623169, \"percentile_inc_nulls\": 0.08159345388412476, \"value_count\": 282, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 282.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.07572275400161743, \"percentile_inc_nulls\": 0.07728433609008789, \"value_count\": 278, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 278.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0716392993927002, \"percentile_inc_nulls\": 0.0732077956199646, \"value_count\": 263, 
\"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 263.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.06772661209106445, \"percentile_inc_nulls\": 0.06930172443389893, \"value_count\": 252, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 252.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.06382948160171509, \"percentile_inc_nulls\": 0.06541115045547485, \"value_count\": 251, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 251.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.06002545356750488, \"percentile_inc_nulls\": 0.06161355972290039, \"value_count\": 245, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 245.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.05636119842529297, \"percentile_inc_nulls\": 0.05795550346374512, \"value_count\": 236, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 236.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.050181686878204346, \"percentile_inc_nulls\": 0.05178642272949219, \"value_count\": 199, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 398.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.04723161458969116, \"percentile_inc_nulls\": 0.04884135723114014, \"value_count\": 190, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 190.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.04459214210510254, \"percentile_inc_nulls\": 0.04620629549026489, \"value_count\": 
170, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 170.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.042309701442718506, \"percentile_inc_nulls\": 0.043927788734436035, \"value_count\": 147, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 147.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.040089428424835205, \"percentile_inc_nulls\": 0.04171121120452881, \"value_count\": 143, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 143.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.03811758756637573, \"percentile_inc_nulls\": 0.03974270820617676, \"value_count\": 127, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 127.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.03622335195541382, \"percentile_inc_nulls\": 0.037851691246032715, \"value_count\": 122, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 122.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.034375667572021484, \"percentile_inc_nulls\": 0.0360071063041687, \"value_count\": 119, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 119.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.03262120485305786, \"percentile_inc_nulls\": 0.03425562381744385, \"value_count\": 113, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 113.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.03092879056930542, \"percentile_inc_nulls\": 0.032566070556640625, 
\"value_count\": 109, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 109.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.029251933097839355, \"percentile_inc_nulls\": 0.030892014503479004, \"value_count\": 108, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.027590572834014893, \"percentile_inc_nulls\": 0.02923351526260376, \"value_count\": 107, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 107.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.026162147521972656, \"percentile_inc_nulls\": 0.02780747413635254, \"value_count\": 92, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 92.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.024826884269714355, \"percentile_inc_nulls\": 0.026474475860595703, \"value_count\": 86, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 86.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.023600280284881592, \"percentile_inc_nulls\": 0.025249958038330078, \"value_count\": 79, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 79.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.022389233112335205, \"percentile_inc_nulls\": 0.024040937423706055, \"value_count\": 78, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.02119368314743042, \"percentile_inc_nulls\": 
0.022847414016723633, \"value_count\": 77, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 77.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.020060241222381592, \"percentile_inc_nulls\": 0.021715879440307617, \"value_count\": 73, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 73.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.019051015377044678, \"percentile_inc_nulls\": 0.02070838212966919, \"value_count\": 65, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 65.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.017156779766082764, \"percentile_inc_nulls\": 0.01881730556488037, \"value_count\": 61, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 122.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.015324652194976807, \"percentile_inc_nulls\": 0.016988277435302734, \"value_count\": 59, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 118.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.013647794723510742, \"percentile_inc_nulls\": 0.015314280986785889, \"value_count\": 54, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.01287144422531128, \"percentile_inc_nulls\": 0.014539241790771484, \"value_count\": 50, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 50.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.012203812599182129, 
\"percentile_inc_nulls\": 0.013872742652893066, \"value_count\": 43, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 43.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.011551737785339355, \"percentile_inc_nulls\": 0.01322174072265625, \"value_count\": 42, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.010433793067932129, \"percentile_inc_nulls\": 0.012105703353881836, \"value_count\": 36, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 72.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.009921431541442871, \"percentile_inc_nulls\": 0.011594176292419434, \"value_count\": 33, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.00947117805480957, \"percentile_inc_nulls\": 0.011144697666168213, \"value_count\": 29, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 29.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.008694827556610107, \"percentile_inc_nulls\": 0.010369658470153809, \"value_count\": 25, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 50.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.008337736129760742, \"percentile_inc_nulls\": 0.010013163089752197, \"value_count\": 23, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 23.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.007996141910552979, 
\"percentile_inc_nulls\": 0.009672164916992188, \"value_count\": 22, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 22.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.007685601711273193, \"percentile_inc_nulls\": 0.00936216115951538, \"value_count\": 20, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 20.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.007390618324279785, \"percentile_inc_nulls\": 0.009067654609680176, \"value_count\": 19, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 19.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0071111321449279785, \"percentile_inc_nulls\": 0.008788645267486572, \"value_count\": 18, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 18.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.006847202777862549, \"percentile_inc_nulls\": 0.00852513313293457, \"value_count\": 17, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 17.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.006598770618438721, \"percentile_inc_nulls\": 0.00827711820602417, \"value_count\": 16, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.005946636199951172, \"percentile_inc_nulls\": 0.0076261162757873535, \"value_count\": 14, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 
0.005542933940887451, \"percentile_inc_nulls\": 0.0072231292724609375, \"value_count\": 13, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 26.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0045182108879089355, \"percentile_inc_nulls\": 0.006200134754180908, \"value_count\": 11, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 66.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.004052400588989258, \"percentile_inc_nulls\": 0.0057350993156433105, \"value_count\": 10, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 30.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.003912687301635742, \"percentile_inc_nulls\": 0.0055956244468688965, \"value_count\": 9, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 9.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.003664255142211914, \"percentile_inc_nulls\": 0.005347609519958496, \"value_count\": 8, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0031208395957946777, \"percentile_inc_nulls\": 0.004805088043212891, \"value_count\": 7, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 35.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0025618672370910645, \"percentile_inc_nulls\": 0.004247069358825684, \"value_count\": 6, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 36.0, \"distinct_value_count\": 173}, 
{\"percentile_ex_nulls\": 0.0018631815910339355, \"percentile_inc_nulls\": 0.0035495758056640625, \"value_count\": 5, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 45.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0011799931526184082, \"percentile_inc_nulls\": 0.002867579460144043, \"value_count\": 4, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 44.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0007142424583435059, \"percentile_inc_nulls\": 0.0024025440216064453, \"value_count\": 3, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 30.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.00034159421920776367, \"percentile_inc_nulls\": 0.0020305514335632324, \"value_count\": 2, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 24.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0016895532608032227, \"value_count\": 1, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 22.0, \"distinct_value_count\": 173}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 10865, \"group_name\": \"_state_\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 10865.0, \"distinct_value_count\": 173}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": 
\"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"state\\\"\", \"subtitle\": \"In this col, 109 values (0.2%) are null and there are 173 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 10865, \"group_name\": \"_state_\", \"value\": \"ca\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 173}, {\"value_count\": 8868, \"group_name\": \"_state_\", \"value\": \"ny\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 173}, {\"value_count\": 5038, \"group_name\": \"_state_\", \"value\": \"tx\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 173}, {\"value_count\": 3277, \"group_name\": \"_state_\", \"value\": \"fl\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 173}, {\"value_count\": 2528, \"group_name\": \"_state_\", \"value\": \"ma\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 173}, {\"value_count\": 2245, \"group_name\": \"_state_\", \"value\": \"il\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 173}, {\"value_count\": 2031, \"group_name\": \"_state_\", \"value\": \"nj\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 173}, {\"value_count\": 1852, \"group_name\": \"_state_\", \"value\": \"pa\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 173}, {\"value_count\": 1636, \"group_name\": \"_state_\", \"value\": \"md\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, 
\"distinct_value_count\": 173}, {\"value_count\": 1610, \"group_name\": \"_state_\", \"value\": \"co\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 173}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"r4\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 173}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"lo\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 173}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"h9\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 173}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"j1\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 173}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"w5\", \"total_non_null_rows\": 64406, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 173}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 
10865]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.89018714427948, \"percentile_inc_nulls\": 0.8903045654296875, \"value_count\": 7077, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 7077.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.8659808039665222, \"percentile_inc_nulls\": 0.866124153137207, \"value_count\": 1560, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1560.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.8488967418670654, \"percentile_inc_nulls\": 0.8490583896636963, \"value_count\": 1101, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1101.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.8348074555397034, \"percentile_inc_nulls\": 0.8349841237068176, \"value_count\": 908, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 908.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.8216956853866577, \"percentile_inc_nulls\": 0.821886420249939, \"value_count\": 845, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 845.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.8089252710342407, \"percentile_inc_nulls\": 0.8091296553611755, \"value_count\": 823, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 823.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.7962014675140381, 
\"percentile_inc_nulls\": 0.7964194416999817, \"value_count\": 820, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 820.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.7720572352409363, \"percentile_inc_nulls\": 0.7723010182380676, \"value_count\": 778, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1556.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.7602488994598389, \"percentile_inc_nulls\": 0.7605053186416626, \"value_count\": 761, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 761.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.7490767240524292, \"percentile_inc_nulls\": 0.7493451237678528, \"value_count\": 720, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 720.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.7294944524765015, \"percentile_inc_nulls\": 0.7297837734222412, \"value_count\": 631, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1262.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.7197343111038208, \"percentile_inc_nulls\": 0.720034122467041, \"value_count\": 629, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 629.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.7100052833557129, \"percentile_inc_nulls\": 0.710315465927124, \"value_count\": 627, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 627.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.7003227472305298, 
\"percentile_inc_nulls\": 0.7006433010101318, \"value_count\": 624, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 624.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6910591721534729, \"percentile_inc_nulls\": 0.6913895606994629, \"value_count\": 597, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 597.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6818732023239136, \"percentile_inc_nulls\": 0.6822134256362915, \"value_count\": 592, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 592.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6750768423080444, \"percentile_inc_nulls\": 0.675424337387085, \"value_count\": 438, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 438.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6684821844100952, \"percentile_inc_nulls\": 0.6688367128372192, \"value_count\": 425, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 425.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6620736718177795, \"percentile_inc_nulls\": 0.6624350547790527, \"value_count\": 413, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 413.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6557738184928894, \"percentile_inc_nulls\": 0.656141996383667, \"value_count\": 406, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 406.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6501877307891846, 
\"percentile_inc_nulls\": 0.6505619287490845, \"value_count\": 360, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 360.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6447724103927612, \"percentile_inc_nulls\": 0.6451523303985596, \"value_count\": 349, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 349.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6395586729049683, \"percentile_inc_nulls\": 0.6399441957473755, \"value_count\": 336, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 336.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6344846487045288, \"percentile_inc_nulls\": 0.6348755955696106, \"value_count\": 327, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 327.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6294417381286621, \"percentile_inc_nulls\": 0.6298379898071289, \"value_count\": 325, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 325.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6245849132537842, \"percentile_inc_nulls\": 0.6249864101409912, \"value_count\": 313, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 313.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6198056936264038, \"percentile_inc_nulls\": 0.6202123165130615, \"value_count\": 308, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 308.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6151506900787354, 
\"percentile_inc_nulls\": 0.6155622601509094, \"value_count\": 300, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 300.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6105421781539917, \"percentile_inc_nulls\": 0.6109586954116821, \"value_count\": 297, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 297.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6062284708023071, \"percentile_inc_nulls\": 0.60664963722229, \"value_count\": 278, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 278.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.6020078659057617, \"percentile_inc_nulls\": 0.6024335622787476, \"value_count\": 272, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 272.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5978803634643555, \"percentile_inc_nulls\": 0.5983104705810547, \"value_count\": 266, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 266.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5937684178352356, \"percentile_inc_nulls\": 0.5942028760910034, \"value_count\": 265, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 265.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5898736715316772, \"percentile_inc_nulls\": 0.5903123617172241, \"value_count\": 251, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 251.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.586103081703186, 
\"percentile_inc_nulls\": 0.5865457653999329, \"value_count\": 243, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 243.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5823790431022644, \"percentile_inc_nulls\": 0.5828256607055664, \"value_count\": 240, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 240.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5750240087509155, \"percentile_inc_nulls\": 0.5754785537719727, \"value_count\": 237, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 474.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.571424126625061, \"percentile_inc_nulls\": 0.5718824863433838, \"value_count\": 232, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 232.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5678397417068481, \"percentile_inc_nulls\": 0.5683019161224365, \"value_count\": 231, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 231.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5643794536590576, \"percentile_inc_nulls\": 0.5648453831672668, \"value_count\": 223, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 223.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5610278248786926, \"percentile_inc_nulls\": 0.5614973306655884, \"value_count\": 216, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 216.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5543556213378906, 
\"percentile_inc_nulls\": 0.5548322200775146, \"value_count\": 215, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 430.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5510504841804504, \"percentile_inc_nulls\": 0.551530659198761, \"value_count\": 213, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 213.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.547947108745575, \"percentile_inc_nulls\": 0.5484305620193481, \"value_count\": 200, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5448592901229858, \"percentile_inc_nulls\": 0.5453460216522217, \"value_count\": 199, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 199.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5419421195983887, \"percentile_inc_nulls\": 0.5424319505691528, \"value_count\": 188, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 188.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5390404462814331, \"percentile_inc_nulls\": 0.5395334362983704, \"value_count\": 187, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 187.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5361698269844055, \"percentile_inc_nulls\": 0.5366659164428711, \"value_count\": 185, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 185.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5334388613700867, 
\"percentile_inc_nulls\": 0.5339378118515015, \"value_count\": 176, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 176.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.530769944190979, \"percentile_inc_nulls\": 0.5312718152999878, \"value_count\": 172, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 172.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5281320810317993, \"percentile_inc_nulls\": 0.5286367535591125, \"value_count\": 170, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 170.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5255252122879028, \"percentile_inc_nulls\": 0.5260326862335205, \"value_count\": 168, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 168.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5229494571685791, \"percentile_inc_nulls\": 0.5234596729278564, \"value_count\": 166, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 166.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5204356908798218, \"percentile_inc_nulls\": 0.5209486484527588, \"value_count\": 162, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 162.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5179375410079956, \"percentile_inc_nulls\": 0.518453061580658, \"value_count\": 161, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 161.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5154547691345215, 
\"percentile_inc_nulls\": 0.5159730315208435, \"value_count\": 160, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5105204582214355, \"percentile_inc_nulls\": 0.511043906211853, \"value_count\": 159, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 318.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5056171417236328, \"percentile_inc_nulls\": 0.5061458349227905, \"value_count\": 158, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 316.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.5007448196411133, \"percentile_inc_nulls\": 0.5012787580490112, \"value_count\": 157, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 314.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.49352943897247314, \"percentile_inc_nulls\": 0.4940711259841919, \"value_count\": 155, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 465.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.49113988876342773, \"percentile_inc_nulls\": 0.49168407917022705, \"value_count\": 154, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 154.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.4865468740463257, \"percentile_inc_nulls\": 0.48709601163864136, \"value_count\": 148, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 296.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.48426592350006104, 
\"percentile_inc_nulls\": 0.4848175048828125, \"value_count\": 147, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 147.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.47973495721817017, \"percentile_inc_nulls\": 0.4802914261817932, \"value_count\": 146, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 292.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.4775471091270447, \"percentile_inc_nulls\": 0.4781058430671692, \"value_count\": 141, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 141.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.475405752658844, \"percentile_inc_nulls\": 0.47596681118011475, \"value_count\": 138, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.4711541533470154, \"percentile_inc_nulls\": 0.47171974182128906, \"value_count\": 137, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 274.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.4690438508987427, \"percentile_inc_nulls\": 0.4696117043495178, \"value_count\": 136, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 136.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.4649474024772644, \"percentile_inc_nulls\": 0.46551966667175293, \"value_count\": 132, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 264.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.4608820080757141, 
\"percentile_inc_nulls\": 0.46145856380462646, \"value_count\": 131, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 262.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.45886480808258057, \"percentile_inc_nulls\": 0.4594435691833496, \"value_count\": 130, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 130.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.4568631052970886, \"percentile_inc_nulls\": 0.4574440121650696, \"value_count\": 129, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 129.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.45490801334381104, \"percentile_inc_nulls\": 0.45549094676971436, \"value_count\": 126, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 126.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.45298391580581665, \"percentile_inc_nulls\": 0.4535689353942871, \"value_count\": 124, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 124.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.44919776916503906, \"percentile_inc_nulls\": 0.44978684186935425, \"value_count\": 122, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 244.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.44733577966690063, \"percentile_inc_nulls\": 0.4479268193244934, \"value_count\": 120, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.4437047839164734, 
\"percentile_inc_nulls\": 0.44429975748062134, \"value_count\": 117, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.4401359558105469, \"percentile_inc_nulls\": 0.4407346844673157, \"value_count\": 115, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 230.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.43838250637054443, \"percentile_inc_nulls\": 0.4389832019805908, \"value_count\": 113, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 113.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.4349067211151123, \"percentile_inc_nulls\": 0.43551111221313477, \"value_count\": 112, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 224.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.43149304389953613, \"percentile_inc_nulls\": 0.4321010708808899, \"value_count\": 110, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 220.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.42980170249938965, \"percentile_inc_nulls\": 0.43041151762008667, \"value_count\": 109, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 109.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.42812585830688477, \"percentile_inc_nulls\": 0.4287375211715698, \"value_count\": 108, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.42646557092666626, 
\"percentile_inc_nulls\": 0.4270789623260498, \"value_count\": 107, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 107.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.4248983860015869, \"percentile_inc_nulls\": 0.42551344633102417, \"value_count\": 101, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 101.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.4233776926994324, \"percentile_inc_nulls\": 0.42399442195892334, \"value_count\": 98, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.42036741971969604, \"percentile_inc_nulls\": 0.4209873676300049, \"value_count\": 97, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 194.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.41890883445739746, \"percentile_inc_nulls\": 0.41953033208847046, \"value_count\": 94, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 94.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.41602271795272827, \"percentile_inc_nulls\": 0.4166473150253296, \"value_count\": 93, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 186.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.41461068391799927, \"percentile_inc_nulls\": 0.41523677110671997, \"value_count\": 91, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 91.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.41321414709091187, 
\"percentile_inc_nulls\": 0.41384172439575195, \"value_count\": 90, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.410514235496521, \"percentile_inc_nulls\": 0.4111446738243103, \"value_count\": 87, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 174.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.40784531831741333, \"percentile_inc_nulls\": 0.40847867727279663, \"value_count\": 86, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 172.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.4065263867378235, \"percentile_inc_nulls\": 0.4071611166000366, \"value_count\": 85, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 85.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.40522295236587524, \"percentile_inc_nulls\": 0.405859112739563, \"value_count\": 84, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 84.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.4013592600822449, \"percentile_inc_nulls\": 0.4019995331764221, \"value_count\": 83, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 249.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.39881449937820435, \"percentile_inc_nulls\": 0.39945751428604126, \"value_count\": 82, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 164.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3975576162338257, 
\"percentile_inc_nulls\": 0.39820194244384766, \"value_count\": 81, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 81.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3963162899017334, \"percentile_inc_nulls\": 0.39696192741394043, \"value_count\": 80, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 80.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.39386463165283203, \"percentile_inc_nulls\": 0.3945128917694092, \"value_count\": 79, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 158.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.39023369550704956, \"percentile_inc_nulls\": 0.3908858299255371, \"value_count\": 78, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3878440856933594, \"percentile_inc_nulls\": 0.38849878311157227, \"value_count\": 77, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 154.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.38666480779647827, \"percentile_inc_nulls\": 0.38732075691223145, \"value_count\": 76, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 76.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.38317352533340454, \"percentile_inc_nulls\": 0.38383322954177856, \"value_count\": 75, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 225.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3809080719947815, 
\"percentile_inc_nulls\": 0.38157016038894653, \"value_count\": 73, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 146.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3797908425331116, \"percentile_inc_nulls\": 0.3804541826248169, \"value_count\": 72, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 72.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3775874376296997, \"percentile_inc_nulls\": 0.37825310230255127, \"value_count\": 71, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 142.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.37650126218795776, \"percentile_inc_nulls\": 0.37716811895370483, \"value_count\": 70, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3754305839538574, \"percentile_inc_nulls\": 0.3760985732078552, \"value_count\": 69, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 69.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.37226516008377075, \"percentile_inc_nulls\": 0.37293654680252075, \"value_count\": 68, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 204.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3712255358695984, \"percentile_inc_nulls\": 0.37189799547195435, \"value_count\": 67, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 67.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.36917728185653687, 
\"percentile_inc_nulls\": 0.3698519468307495, \"value_count\": 66, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 132.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3661980628967285, \"percentile_inc_nulls\": 0.36687594652175903, \"value_count\": 64, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 192.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.36522048711776733, \"percentile_inc_nulls\": 0.36589938402175903, \"value_count\": 63, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 63.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.36425846815109253, \"percentile_inc_nulls\": 0.3649383783340454, \"value_count\": 62, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 62.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.36236536502838135, \"percentile_inc_nulls\": 0.36304736137390137, \"value_count\": 61, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 122.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3605033755302429, \"percentile_inc_nulls\": 0.3611873388290405, \"value_count\": 60, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.35501039028167725, \"percentile_inc_nulls\": 0.3557002544403076, \"value_count\": 59, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 354.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.35321044921875, 
\"percentile_inc_nulls\": 0.3539022207260132, \"value_count\": 58, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 116.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3523414731025696, \"percentile_inc_nulls\": 0.35303419828414917, \"value_count\": 56, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.35148805379867554, \"percentile_inc_nulls\": 0.35218167304992676, \"value_count\": 55, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 55.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.34813642501831055, \"percentile_inc_nulls\": 0.3488336205482483, \"value_count\": 54, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 216.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.34649163484573364, \"percentile_inc_nulls\": 0.3471905589103699, \"value_count\": 53, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 106.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.34245723485946655, \"percentile_inc_nulls\": 0.3431605100631714, \"value_count\": 52, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 260.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3361263871192932, \"percentile_inc_nulls\": 0.33683639764785767, \"value_count\": 51, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 408.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3330230116844177, 
\"percentile_inc_nulls\": 0.3337363600730896, \"value_count\": 50, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.32846105098724365, \"percentile_inc_nulls\": 0.3291792869567871, \"value_count\": 49, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 294.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3262265920639038, \"percentile_inc_nulls\": 0.3269472122192383, \"value_count\": 48, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.32330942153930664, \"percentile_inc_nulls\": 0.32403314113616943, \"value_count\": 47, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 188.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.31974053382873535, \"percentile_inc_nulls\": 0.32046812772750854, \"value_count\": 46, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 230.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3155509829521179, \"percentile_inc_nulls\": 0.31628304719924927, \"value_count\": 45, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 270.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.31213730573654175, \"percentile_inc_nulls\": 0.3128729462623596, \"value_count\": 44, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 220.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3101356029510498, 
\"percentile_inc_nulls\": 0.31087344884872437, \"value_count\": 43, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 129.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3075287938117981, \"percentile_inc_nulls\": 0.3082693815231323, \"value_count\": 42, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 168.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.3030754327774048, \"percentile_inc_nulls\": 0.30382078886032104, \"value_count\": 41, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 287.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.29935139417648315, \"percentile_inc_nulls\": 0.30010074377059937, \"value_count\": 40, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 240.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.2987462282180786, \"percentile_inc_nulls\": 0.29949623346328735, \"value_count\": 39, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 39.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.29697734117507935, \"percentile_inc_nulls\": 0.2977291941642761, \"value_count\": 38, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.29295843839645386, \"percentile_inc_nulls\": 0.29371464252471924, \"value_count\": 37, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 259.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.28960680961608887, 
\"percentile_inc_nulls\": 0.29036659002304077, \"value_count\": 36, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 216.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.287434458732605, \"percentile_inc_nulls\": 0.2881965637207031, \"value_count\": 35, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 140.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.28426897525787354, \"percentile_inc_nulls\": 0.2850344777107239, \"value_count\": 34, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 204.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.281196653842926, \"percentile_inc_nulls\": 0.281965434551239, \"value_count\": 33, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 198.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.2782174348831177, \"percentile_inc_nulls\": 0.27898937463760376, \"value_count\": 32, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 192.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.2743692398071289, \"percentile_inc_nulls\": 0.2751452922821045, \"value_count\": 31, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 248.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.2692486643791199, \"percentile_inc_nulls\": 0.2700302004814148, \"value_count\": 30, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 330.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.2669987082481384, 
\"percentile_inc_nulls\": 0.26778268814086914, \"value_count\": 29, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 145.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.2630884647369385, \"percentile_inc_nulls\": 0.26387661695480347, \"value_count\": 28, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 252.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.25889891386032104, \"percentile_inc_nulls\": 0.2596915364265442, \"value_count\": 27, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 270.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.2552679777145386, \"percentile_inc_nulls\": 0.2560644745826721, \"value_count\": 26, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.2502250075340271, \"percentile_inc_nulls\": 0.25102686882019043, \"value_count\": 25, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 325.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.24426651000976562, \"percentile_inc_nulls\": 0.2450748085975647, \"value_count\": 24, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 384.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.2389131784439087, \"percentile_inc_nulls\": 0.2397271990776062, \"value_count\": 23, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 345.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.23447537422180176, 
\"percentile_inc_nulls\": 0.23529410362243652, \"value_count\": 22, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 286.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.2289358377456665, \"percentile_inc_nulls\": 0.2297605276107788, \"value_count\": 21, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 357.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.22428077459335327, \"percentile_inc_nulls\": 0.22511041164398193, \"value_count\": 20, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 300.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.21691030263900757, \"percentile_inc_nulls\": 0.21774780750274658, \"value_count\": 19, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 475.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.21216213703155518, \"percentile_inc_nulls\": 0.2130047082901001, \"value_count\": 18, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 306.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.20794153213500977, \"percentile_inc_nulls\": 0.20878863334655762, \"value_count\": 17, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 272.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.2014865279197693, \"percentile_inc_nulls\": 0.20234054327011108, \"value_count\": 16, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 416.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.19380569458007812, 
\"percentile_inc_nulls\": 0.19466793537139893, \"value_count\": 15, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 495.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.18620240688323975, \"percentile_inc_nulls\": 0.18707275390625, \"value_count\": 14, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 490.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.17853707075119019, \"percentile_inc_nulls\": 0.17941564321517944, \"value_count\": 13, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 494.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.17071658372879028, \"percentile_inc_nulls\": 0.1716035008430481, \"value_count\": 12, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 504.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.16269433498382568, \"percentile_inc_nulls\": 0.16358983516693115, \"value_count\": 11, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 517.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.15384972095489502, \"percentile_inc_nulls\": 0.15475469827651978, \"value_count\": 10, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 570.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.1442137360572815, \"percentile_inc_nulls\": 0.1451290249824524, \"value_count\": 9, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 621.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.1342829465866089, 
\"percentile_inc_nulls\": 0.1352088451385498, \"value_count\": 8, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 640.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.12331253290176392, \"percentile_inc_nulls\": 0.12425017356872559, \"value_count\": 7, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 707.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.1114886999130249, \"percentile_inc_nulls\": 0.11243897676467896, \"value_count\": 6, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 762.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.09736835956573486, \"percentile_inc_nulls\": 0.09833371639251709, \"value_count\": 5, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 910.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.08272039890289307, \"percentile_inc_nulls\": 0.083701491355896, \"value_count\": 4, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 944.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.06396055221557617, \"percentile_inc_nulls\": 0.06496161222457886, \"value_count\": 3, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1209.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.038668036460876465, \"percentile_inc_nulls\": 0.03969621658325195, \"value_count\": 2, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1630.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 
0.0010695457458496094, \"value_count\": 1, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 2492.0, \"distinct_value_count\": 5261}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 7077, \"group_name\": \"_city_\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 7077.0, \"distinct_value_count\": 5261}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"city\\\"\", \"subtitle\": \"In this col, 69 values (0.1%) are null and there are 5261 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 7077, \"group_name\": \"_city_\", \"value\": \"new york\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 5261}, {\"value_count\": 1560, \"group_name\": \"_city_\", \"value\": \"houston\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 5261}, {\"value_count\": 1101, \"group_name\": \"_city_\", \"value\": \"dallas\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 5261}, {\"value_count\": 908, \"group_name\": \"_city_\", \"value\": \"las vegas\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 5261}, {\"value_count\": 845, 
\"group_name\": \"_city_\", \"value\": \"calabasas\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 5261}, {\"value_count\": 823, \"group_name\": \"_city_\", \"value\": \"charlotte\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 5261}, {\"value_count\": 820, \"group_name\": \"_city_\", \"value\": \"chicago\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 5261}, {\"value_count\": 778, \"group_name\": \"_city_\", \"value\": \"boston\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 5261}, {\"value_count\": 778, \"group_name\": \"_city_\", \"value\": \"san diego\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 5261}, {\"value_count\": 761, \"group_name\": \"_city_\", \"value\": \"wilmington\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 5261}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"new prague\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 5261}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"vallejo\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 5261}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"watseka\", 
\"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 5261}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"temple\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 5261}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"elk city\", \"total_non_null_rows\": 64446, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 5261}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 7077]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9876123666763306, \"percentile_inc_nulls\": 0.9876307845115662, \"value_count\": 798, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 798.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9790434241294861, \"percentile_inc_nulls\": 0.9790746569633484, \"value_count\": 552, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 552.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.973858654499054, \"percentile_inc_nulls\": 0.9738975167274475, \"value_count\": 334, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 334.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 
0.9691861271858215, \"percentile_inc_nulls\": 0.9692319631576538, \"value_count\": 301, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 301.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9648861289024353, \"percentile_inc_nulls\": 0.9649384021759033, \"value_count\": 277, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 277.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.960710346698761, \"percentile_inc_nulls\": 0.9607688188552856, \"value_count\": 269, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 269.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9570778608322144, \"percentile_inc_nulls\": 0.9571417570114136, \"value_count\": 234, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9534919857978821, \"percentile_inc_nulls\": 0.9535611867904663, \"value_count\": 231, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 231.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9500147700309753, \"percentile_inc_nulls\": 0.9500890970230103, \"value_count\": 224, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 224.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9466617107391357, \"percentile_inc_nulls\": 0.9467410445213318, \"value_count\": 216, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 
216.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9434173107147217, \"percentile_inc_nulls\": 0.9435015320777893, \"value_count\": 209, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 209.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9403126239776611, \"percentile_inc_nulls\": 0.9404014348983765, \"value_count\": 200, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9372545480728149, \"percentile_inc_nulls\": 0.9373478889465332, \"value_count\": 197, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 197.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9343982338905334, \"percentile_inc_nulls\": 0.9344958662986755, \"value_count\": 184, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 184.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9317126870155334, \"percentile_inc_nulls\": 0.9318143129348755, \"value_count\": 173, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 173.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.929104745388031, \"percentile_inc_nulls\": 0.9292102456092834, \"value_count\": 168, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 168.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9266365766525269, \"percentile_inc_nulls\": 0.9267457127571106, \"value_count\": 159, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, 
\"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 159.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9242925047874451, \"percentile_inc_nulls\": 0.9244051575660706, \"value_count\": 151, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 151.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9222744703292847, \"percentile_inc_nulls\": 0.9223901629447937, \"value_count\": 130, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 130.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9203029870986938, \"percentile_inc_nulls\": 0.9204216003417969, \"value_count\": 127, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 127.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9183470606803894, \"percentile_inc_nulls\": 0.9184685945510864, \"value_count\": 126, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 126.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9164066314697266, \"percentile_inc_nulls\": 0.9165310263633728, \"value_count\": 125, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 125.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9146214723587036, \"percentile_inc_nulls\": 0.91474848985672, \"value_count\": 115, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 115.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9129294157028198, \"percentile_inc_nulls\": 0.9130589962005615, \"value_count\": 109, 
\"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 109.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.911268413066864, \"percentile_inc_nulls\": 0.9114004373550415, \"value_count\": 107, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 107.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9096229076385498, \"percentile_inc_nulls\": 0.9097574353218079, \"value_count\": 106, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 106.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9081016778945923, \"percentile_inc_nulls\": 0.908238410949707, \"value_count\": 98, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9066113829612732, \"percentile_inc_nulls\": 0.9067503809928894, \"value_count\": 96, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 96.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9051367044448853, \"percentile_inc_nulls\": 0.9052778482437134, \"value_count\": 95, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 95.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9036774635314941, \"percentile_inc_nulls\": 0.903820812702179, \"value_count\": 94, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 94.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.9022338390350342, 
\"percentile_inc_nulls\": 0.9023792743682861, \"value_count\": 93, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 93.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.900805652141571, \"percentile_inc_nulls\": 0.9009532928466797, \"value_count\": 92, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 92.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8994551301002502, \"percentile_inc_nulls\": 0.8996047377586365, \"value_count\": 87, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 87.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.898213267326355, \"percentile_inc_nulls\": 0.8983647227287292, \"value_count\": 80, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 80.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8970179557800293, \"percentile_inc_nulls\": 0.8971711993217468, \"value_count\": 77, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 77.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8959158062934875, \"percentile_inc_nulls\": 0.896070659160614, \"value_count\": 71, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 71.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8937735557556152, \"percentile_inc_nulls\": 0.8939316272735596, \"value_count\": 69, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 
40345}, {\"percentile_ex_nulls\": 0.8927335143089294, \"percentile_inc_nulls\": 0.8928931355476379, \"value_count\": 67, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 67.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8917089700698853, \"percentile_inc_nulls\": 0.8918701410293579, \"value_count\": 66, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 66.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.890715479850769, \"percentile_inc_nulls\": 0.8908780813217163, \"value_count\": 64, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 64.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8897685408592224, \"percentile_inc_nulls\": 0.8899325728416443, \"value_count\": 61, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 61.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8888682126998901, \"percentile_inc_nulls\": 0.8890335559844971, \"value_count\": 58, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.887129545211792, \"percentile_inc_nulls\": 0.887297511100769, \"value_count\": 56, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8863068222999573, \"percentile_inc_nulls\": 0.8864760398864746, \"value_count\": 53, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, 
\"sum_tokens_in_value_count_group\": 53.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8839317560195923, \"percentile_inc_nulls\": 0.8841044902801514, \"value_count\": 51, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 153.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8831555843353271, \"percentile_inc_nulls\": 0.883329451084137, \"value_count\": 50, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 50.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8823949694633484, \"percentile_inc_nulls\": 0.8825699687004089, \"value_count\": 49, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 49.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8816808462142944, \"percentile_inc_nulls\": 0.8818569183349609, \"value_count\": 46, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 46.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8810133934020996, \"percentile_inc_nulls\": 0.8811904191970825, \"value_count\": 43, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 43.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.880376935005188, \"percentile_inc_nulls\": 0.8805549144744873, \"value_count\": 41, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 41.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.879755973815918, \"percentile_inc_nulls\": 0.8799349069595337, \"value_count\": 40, \"group_name\": \"_street_address_\", 
\"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 40.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.877939760684967, \"percentile_inc_nulls\": 0.8781213760375977, \"value_count\": 39, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 117.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.877349853515625, \"percentile_inc_nulls\": 0.8775323629379272, \"value_count\": 38, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 38.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8762011528015137, \"percentile_inc_nulls\": 0.8763853311538696, \"value_count\": 37, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 74.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8745245933532715, \"percentile_inc_nulls\": 0.874711275100708, \"value_count\": 36, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8734379410743713, \"percentile_inc_nulls\": 0.8736262917518616, \"value_count\": 35, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8718545436859131, \"percentile_inc_nulls\": 0.8720452785491943, \"value_count\": 34, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8703177571296692, \"percentile_inc_nulls\": 0.8705106973648071, 
\"value_count\": 33, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8683307766914368, \"percentile_inc_nulls\": 0.8685266971588135, \"value_count\": 32, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 128.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.866887092590332, \"percentile_inc_nulls\": 0.8670851588249207, \"value_count\": 31, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 93.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8654899597167969, \"percentile_inc_nulls\": 0.8656901717185974, \"value_count\": 30, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8645896315574646, \"percentile_inc_nulls\": 0.8647911548614502, \"value_count\": 29, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8628510236740112, \"percentile_inc_nulls\": 0.8630551099777222, \"value_count\": 28, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8599171042442322, \"percentile_inc_nulls\": 0.8601255416870117, \"value_count\": 27, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 189.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 
0.859513521194458, \"percentile_inc_nulls\": 0.8597225546836853, \"value_count\": 26, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 26.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.85679692029953, \"percentile_inc_nulls\": 0.857010006904602, \"value_count\": 25, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 175.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8545615673065186, \"percentile_inc_nulls\": 0.8547779321670532, \"value_count\": 24, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8527763485908508, \"percentile_inc_nulls\": 0.8529953956604004, \"value_count\": 23, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 115.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8507272601127625, \"percentile_inc_nulls\": 0.8509494066238403, \"value_count\": 22, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 132.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8490973114967346, \"percentile_inc_nulls\": 0.8493218421936035, \"value_count\": 21, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 105.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8463031053543091, \"percentile_inc_nulls\": 0.8465318083763123, \"value_count\": 20, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 180.0, 
\"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8439435362815857, \"percentile_inc_nulls\": 0.8441757559776306, \"value_count\": 19, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 152.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8419876098632812, \"percentile_inc_nulls\": 0.8422227501869202, \"value_count\": 18, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 126.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8406681418418884, \"percentile_inc_nulls\": 0.8409051895141602, \"value_count\": 17, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 85.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.837687611579895, \"percentile_inc_nulls\": 0.8379291296005249, \"value_count\": 16, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 192.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8362905383110046, \"percentile_inc_nulls\": 0.8365341424942017, \"value_count\": 15, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8334652781486511, \"percentile_inc_nulls\": 0.8337130546569824, \"value_count\": 14, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 182.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.830034613609314, \"percentile_inc_nulls\": 0.830287516117096, \"value_count\": 13, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, 
\"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 221.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8257501721382141, \"percentile_inc_nulls\": 0.8260094523429871, \"value_count\": 12, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 276.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8211396932601929, \"percentile_inc_nulls\": 0.8214058876037598, \"value_count\": 11, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 297.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8163274526596069, \"percentile_inc_nulls\": 0.8166007995605469, \"value_count\": 10, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 310.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8086434006690979, \"percentile_inc_nulls\": 0.8089281320571899, \"value_count\": 9, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 495.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.8003228902816772, \"percentile_inc_nulls\": 0.8006200194358826, \"value_count\": 8, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 536.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.7909778356552124, \"percentile_inc_nulls\": 0.7912888526916504, \"value_count\": 7, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 602.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.7787764668464661, \"percentile_inc_nulls\": 0.7791056632995605, \"value_count\": 6, \"group_name\": 
\"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 786.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.7628650069236755, \"percentile_inc_nulls\": 0.7632178664207458, \"value_count\": 5, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1025.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.7326254844665527, \"percentile_inc_nulls\": 0.7330232858657837, \"value_count\": 4, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1948.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.6715720891952515, \"percentile_inc_nulls\": 0.6720607280731201, \"value_count\": 3, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 3933.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.4995110034942627, \"percentile_inc_nulls\": 0.5002557635307312, \"value_count\": 2, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 11084.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.001488029956817627, \"value_count\": 1, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 32178.0, \"distinct_value_count\": 40345}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 798, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 798.0, \"distinct_value_count\": 40345}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": 
\"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"street_address\\\"\", \"subtitle\": \"In this col, 96 values (0.1%) are null and there are 40345 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 798, \"group_name\": \"_street_address_\", \"value\": \"4500 park granada\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 40345}, {\"value_count\": 552, \"group_name\": \"_street_address_\", \"value\": \"711 high street\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 40345}, {\"value_count\": 334, \"group_name\": \"_street_address_\", \"value\": \"383 madison avenue\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 40345}, {\"value_count\": 301, \"group_name\": \"_street_address_\", \"value\": \"8400 normandale lake blvd\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 40345}, {\"value_count\": 277, \"group_name\": \"_street_address_\", \"value\": \"1585 broadway\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 40345}, {\"value_count\": 269, \"group_name\": \"_street_address_\", \"value\": \"11 madison avenue\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 40345}, {\"value_count\": 234, \"group_name\": \"_street_address_\", \"value\": \"7485 new horizon way\", 
\"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 40345}, {\"value_count\": 231, \"group_name\": \"_street_address_\", \"value\": \"c/o wilmington trust company\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 40345}, {\"value_count\": 224, \"group_name\": \"_street_address_\", \"value\": \"4ld financial center floor 10\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 40345}, {\"value_count\": 216, \"group_name\": \"_street_address_\", \"value\": \"85 broad street\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 40345}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"2100 east 54th street north\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 40345}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"900 e. 
old settlers boulevard, suite 100\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 40345}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"2400 ellis road\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 40345}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"3000 olympus blvd.\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 40345}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"410 monon blvd\", \"total_non_null_rows\": 64419, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 40345}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 798]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9859902262687683, \"percentile_inc_nulls\": 0.9860497713088989, \"value_count\": 900, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 900.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.9729300737380981, \"percentile_inc_nulls\": 0.9730450510978699, \"value_count\": 839, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 839.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.9513862133026123, \"percentile_inc_nulls\": 
0.9515926837921143, \"value_count\": 692, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1384.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.9406920671463013, \"percentile_inc_nulls\": 0.9409439563751221, \"value_count\": 687, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 687.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.932099461555481, \"percentile_inc_nulls\": 0.9323878288269043, \"value_count\": 552, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 552.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.9242540001869202, \"percentile_inc_nulls\": 0.924575686454773, \"value_count\": 504, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 504.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.9171868562698364, \"percentile_inc_nulls\": 0.9175385236740112, \"value_count\": 454, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 454.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.9101663827896118, \"percentile_inc_nulls\": 0.9105479121208191, \"value_count\": 451, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 451.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.9037219285964966, \"percentile_inc_nulls\": 0.9041308164596558, \"value_count\": 414, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 414.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8973397016525269, 
\"percentile_inc_nulls\": 0.8977757096290588, \"value_count\": 410, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 410.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8914711475372314, \"percentile_inc_nulls\": 0.8919321298599243, \"value_count\": 377, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 377.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8856960535049438, \"percentile_inc_nulls\": 0.8861815333366394, \"value_count\": 371, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 371.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8799831867218018, \"percentile_inc_nulls\": 0.8804929256439209, \"value_count\": 367, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 367.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8744571208953857, \"percentile_inc_nulls\": 0.8749903440475464, \"value_count\": 355, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 355.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8690711259841919, \"percentile_inc_nulls\": 0.8696272373199463, \"value_count\": 346, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 346.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8637007474899292, \"percentile_inc_nulls\": 0.8642796277999878, \"value_count\": 345, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 345.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 
0.8586105108261108, \"percentile_inc_nulls\": 0.8592110276222229, \"value_count\": 327, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 327.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8535358905792236, \"percentile_inc_nulls\": 0.8541579246520996, \"value_count\": 326, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 326.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8486013412475586, \"percentile_inc_nulls\": 0.8492443561553955, \"value_count\": 317, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 317.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8438692092895508, \"percentile_inc_nulls\": 0.8445322513580322, \"value_count\": 304, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 304.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8344670534133911, \"percentile_inc_nulls\": 0.8351701498031616, \"value_count\": 302, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 604.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8301396369934082, \"percentile_inc_nulls\": 0.8308610320091248, \"value_count\": 278, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 278.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8261702060699463, \"percentile_inc_nulls\": 0.8269084692001343, \"value_count\": 255, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 255.0, \"distinct_value_count\": 8809}, 
{\"percentile_ex_nulls\": 0.8224809765815735, \"percentile_inc_nulls\": 0.8232349157333374, \"value_count\": 237, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 237.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8191341757774353, \"percentile_inc_nulls\": 0.8199023604393005, \"value_count\": 215, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 215.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8160520792007446, \"percentile_inc_nulls\": 0.8168332576751709, \"value_count\": 198, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 198.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8130165934562683, \"percentile_inc_nulls\": 0.8138107061386108, \"value_count\": 195, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 195.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8099967241287231, \"percentile_inc_nulls\": 0.8108037114143372, \"value_count\": 194, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 194.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8069924116134644, \"percentile_inc_nulls\": 0.8078121542930603, \"value_count\": 193, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 193.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8040503859519958, \"percentile_inc_nulls\": 0.8048825860023499, \"value_count\": 189, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 189.0, 
\"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.8011394739151001, \"percentile_inc_nulls\": 0.8019840717315674, \"value_count\": 187, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 187.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7954110503196716, \"percentile_inc_nulls\": 0.7962799072265625, \"value_count\": 184, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 368.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7928581237792969, \"percentile_inc_nulls\": 0.7937378883361816, \"value_count\": 164, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 164.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7903519868850708, \"percentile_inc_nulls\": 0.7912423610687256, \"value_count\": 161, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 161.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7878613471984863, \"percentile_inc_nulls\": 0.7887623310089111, \"value_count\": 160, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7829112410545349, \"percentile_inc_nulls\": 0.7838332056999207, \"value_count\": 159, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 318.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7804672718048096, \"percentile_inc_nulls\": 0.781399667263031, \"value_count\": 157, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, 
\"sum_tokens_in_value_count_group\": 157.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7757973670959473, \"percentile_inc_nulls\": 0.7767496109008789, \"value_count\": 150, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 300.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7735869884490967, \"percentile_inc_nulls\": 0.7745485305786133, \"value_count\": 142, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 142.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7691972255706787, \"percentile_inc_nulls\": 0.7701774835586548, \"value_count\": 141, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 282.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7648698091506958, \"percentile_inc_nulls\": 0.7658684253692627, \"value_count\": 139, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 278.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7627216577529907, \"percentile_inc_nulls\": 0.7637293338775635, \"value_count\": 138, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7605890035629272, \"percentile_inc_nulls\": 0.7616058588027954, \"value_count\": 137, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 137.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7585031390190125, \"percentile_inc_nulls\": 0.7595287561416626, \"value_count\": 134, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, 
\"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 134.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7564639449119568, \"percentile_inc_nulls\": 0.7574982643127441, \"value_count\": 131, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 131.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7524166703224182, \"percentile_inc_nulls\": 0.7534681558609009, \"value_count\": 130, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 260.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7484316825866699, \"percentile_inc_nulls\": 0.7495001554489136, \"value_count\": 128, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 256.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7446334958076477, \"percentile_inc_nulls\": 0.7457180619239807, \"value_count\": 122, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 244.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7427811026573181, \"percentile_inc_nulls\": 0.7438734769821167, \"value_count\": 119, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 119.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.739169716835022, \"percentile_inc_nulls\": 0.7402774691581726, \"value_count\": 116, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 232.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7356828451156616, \"percentile_inc_nulls\": 0.7368053793907166, \"value_count\": 112, \"group_name\": \"_zip_code_\", 
\"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 224.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7339705228805542, \"percentile_inc_nulls\": 0.7351003885269165, \"value_count\": 110, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7322738170623779, \"percentile_inc_nulls\": 0.7334108352661133, \"value_count\": 109, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 109.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7305926084518433, \"percentile_inc_nulls\": 0.7317367792129517, \"value_count\": 108, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7272614240646362, \"percentile_inc_nulls\": 0.7284197807312012, \"value_count\": 107, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 214.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7256580591201782, \"percentile_inc_nulls\": 0.7268232107162476, \"value_count\": 103, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 103.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7224825620651245, \"percentile_inc_nulls\": 0.7236611843109131, \"value_count\": 102, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 204.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7209103107452393, \"percentile_inc_nulls\": 0.7220956087112427, \"value_count\": 101, \"group_name\": 
\"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 101.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.717797040939331, \"percentile_inc_nulls\": 0.7189955711364746, \"value_count\": 100, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7116950154304504, \"percentile_inc_nulls\": 0.7129194736480713, \"value_count\": 98, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 392.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7086751461029053, \"percentile_inc_nulls\": 0.7099124193191528, \"value_count\": 97, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 194.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7056863903999329, \"percentile_inc_nulls\": 0.7069363594055176, \"value_count\": 96, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 192.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7042075991630554, \"percentile_inc_nulls\": 0.7054638862609863, \"value_count\": 95, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 95.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7027443647384644, \"percentile_inc_nulls\": 0.7040067911148071, \"value_count\": 94, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 94.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.7012966871261597, \"percentile_inc_nulls\": 0.7025653123855591, \"value_count\": 93, 
\"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 93.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6998645663261414, \"percentile_inc_nulls\": 0.7011392712593079, \"value_count\": 92, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 92.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6956616640090942, \"percentile_inc_nulls\": 0.6969541907310486, \"value_count\": 90, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 270.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6942762136459351, \"percentile_inc_nulls\": 0.6955746412277222, \"value_count\": 89, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 89.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.690166711807251, \"percentile_inc_nulls\": 0.6914826035499573, \"value_count\": 88, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 264.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.686103880405426, \"percentile_inc_nulls\": 0.6874370574951172, \"value_count\": 87, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 261.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.682087779045105, \"percentile_inc_nulls\": 0.6834379434585571, \"value_count\": 86, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 258.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6767951846122742, \"percentile_inc_nulls\": 0.6781678795814514, \"value_count\": 
85, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 340.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6741800308227539, \"percentile_inc_nulls\": 0.6755638122558594, \"value_count\": 84, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 168.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6715960502624512, \"percentile_inc_nulls\": 0.6729907989501953, \"value_count\": 83, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 166.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6690431237220764, \"percentile_inc_nulls\": 0.6704487204551697, \"value_count\": 82, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 164.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6665213704109192, \"percentile_inc_nulls\": 0.667937695980072, \"value_count\": 81, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 162.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6652916669845581, \"percentile_inc_nulls\": 0.6667131781578064, \"value_count\": 79, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 79.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6628632545471191, \"percentile_inc_nulls\": 0.6642951369285583, \"value_count\": 78, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 156.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6592674255371094, \"percentile_inc_nulls\": 0.6607145667076111, 
\"value_count\": 77, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 231.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6545352935791016, \"percentile_inc_nulls\": 0.6560025215148926, \"value_count\": 76, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 304.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.651079535484314, \"percentile_inc_nulls\": 0.6525614261627197, \"value_count\": 74, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 222.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6488068103790283, \"percentile_inc_nulls\": 0.6502983570098877, \"value_count\": 73, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 146.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.646565318107605, \"percentile_inc_nulls\": 0.6480663418769836, \"value_count\": 72, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6432496309280396, \"percentile_inc_nulls\": 0.64476478099823, \"value_count\": 71, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 213.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6388910412788391, \"percentile_inc_nulls\": 0.6404247283935547, \"value_count\": 70, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 280.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.636742889881134, \"percentile_inc_nulls\": 
0.6382856369018555, \"value_count\": 69, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6346258521080017, \"percentile_inc_nulls\": 0.636177659034729, \"value_count\": 68, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 136.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6294889450073242, \"percentile_inc_nulls\": 0.6310625076293945, \"value_count\": 66, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 330.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6264535188674927, \"percentile_inc_nulls\": 0.6280399560928345, \"value_count\": 65, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 195.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6234647631645203, \"percentile_inc_nulls\": 0.6250638961791992, \"value_count\": 64, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 192.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6195420026779175, \"percentile_inc_nulls\": 0.6211578845977783, \"value_count\": 63, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 252.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6166466474533081, \"percentile_inc_nulls\": 0.6182748079299927, \"value_count\": 62, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 186.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6147475838661194, 
\"percentile_inc_nulls\": 0.6163837909698486, \"value_count\": 61, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 122.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6110116243362427, \"percentile_inc_nulls\": 0.612663745880127, \"value_count\": 60, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 240.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6100932359695435, \"percentile_inc_nulls\": 0.6117491722106934, \"value_count\": 59, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 59.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.6055790185928345, \"percentile_inc_nulls\": 0.607254147529602, \"value_count\": 58, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 290.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.600348711013794, \"percentile_inc_nulls\": 0.602046012878418, \"value_count\": 56, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 336.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5943556427955627, \"percentile_inc_nulls\": 0.5960783958435059, \"value_count\": 55, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 385.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5901527404785156, \"percentile_inc_nulls\": 0.5918933153152466, \"value_count\": 54, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 270.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 
0.5893276929855347, \"percentile_inc_nulls\": 0.5910718441009521, \"value_count\": 53, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 53.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5877087712287903, \"percentile_inc_nulls\": 0.5894597768783569, \"value_count\": 52, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 104.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5829454660415649, \"percentile_inc_nulls\": 0.5847167372703552, \"value_count\": 51, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 306.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5790538787841797, \"percentile_inc_nulls\": 0.5808416604995728, \"value_count\": 50, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 250.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.578291118144989, \"percentile_inc_nulls\": 0.5800821781158447, \"value_count\": 49, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 49.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5753023624420166, \"percentile_inc_nulls\": 0.5771061182022095, \"value_count\": 48, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 192.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5716443061828613, \"percentile_inc_nulls\": 0.5734635591506958, \"value_count\": 47, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 235.0, \"distinct_value_count\": 8809}, 
{\"percentile_ex_nulls\": 0.563767671585083, \"percentile_inc_nulls\": 0.5656204223632812, \"value_count\": 46, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 506.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5560623407363892, \"percentile_inc_nulls\": 0.5579477548599243, \"value_count\": 45, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 495.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5533226728439331, \"percentile_inc_nulls\": 0.5552197098731995, \"value_count\": 44, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 176.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5459597110748291, \"percentile_inc_nulls\": 0.5478881001472473, \"value_count\": 43, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 473.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5407294034957886, \"percentile_inc_nulls\": 0.542680025100708, \"value_count\": 42, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 336.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5349854230880737, \"percentile_inc_nulls\": 0.5369603633880615, \"value_count\": 41, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 369.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5312495231628418, \"percentile_inc_nulls\": 0.5332403182983398, \"value_count\": 40, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 240.0, \"distinct_value_count\": 
8809}, {\"percentile_ex_nulls\": 0.5269998908042908, \"percentile_inc_nulls\": 0.5290087461471558, \"value_count\": 39, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 273.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5204931497573853, \"percentile_inc_nulls\": 0.5225296020507812, \"value_count\": 38, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 418.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5158854722976685, \"percentile_inc_nulls\": 0.5179415941238403, \"value_count\": 37, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 296.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5119627714157104, \"percentile_inc_nulls\": 0.5140354633331299, \"value_count\": 36, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 252.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5081490278244019, \"percentile_inc_nulls\": 0.5102379322052002, \"value_count\": 35, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 245.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.5023272037506104, \"percentile_inc_nulls\": 0.5044407844543457, \"value_count\": 34, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 374.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.49564921855926514, \"percentile_inc_nulls\": 0.49779123067855835, \"value_count\": 33, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 429.0, 
\"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.48718106746673584, \"percentile_inc_nulls\": 0.4893590807914734, \"value_count\": 32, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 544.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.47897762060165405, \"percentile_inc_nulls\": 0.48119044303894043, \"value_count\": 31, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 527.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.4687038064002991, \"percentile_inc_nulls\": 0.47096025943756104, \"value_count\": 30, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 660.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.46238380670547485, \"percentile_inc_nulls\": 0.4646671414375305, \"value_count\": 29, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 406.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.4549742341041565, \"percentile_inc_nulls\": 0.4572889804840088, \"value_count\": 28, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 476.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.44951045513153076, \"percentile_inc_nulls\": 0.4518483877182007, \"value_count\": 27, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 351.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.4426301121711731, \"percentile_inc_nulls\": 0.44499731063842773, \"value_count\": 26, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, 
\"sum_tokens_in_value_count_group\": 442.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.4317336082458496, \"percentile_inc_nulls\": 0.4341471195220947, \"value_count\": 25, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 700.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.4261297583580017, \"percentile_inc_nulls\": 0.42856699228286743, \"value_count\": 24, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 360.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.4182531237602234, \"percentile_inc_nulls\": 0.4207238554954529, \"value_count\": 23, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 506.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.40934914350509644, \"percentile_inc_nulls\": 0.4118577241897583, \"value_count\": 22, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 572.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.4001961350440979, \"percentile_inc_nulls\": 0.40274351835250854, \"value_count\": 21, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 588.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.3930355906486511, \"percentile_inc_nulls\": 0.395613431930542, \"value_count\": 20, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 460.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.3841627836227417, \"percentile_inc_nulls\": 0.3867782950401306, \"value_count\": 19, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, 
\"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 570.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.3754767179489136, \"percentile_inc_nulls\": 0.37812912464141846, \"value_count\": 18, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 558.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.3662147521972656, \"percentile_inc_nulls\": 0.3689064383506775, \"value_count\": 17, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 595.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.35450881719589233, \"percentile_inc_nulls\": 0.35725027322769165, \"value_count\": 16, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 752.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.34353452920913696, \"percentile_inc_nulls\": 0.34632253646850586, \"value_count\": 15, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 705.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.33242011070251465, \"percentile_inc_nulls\": 0.33525538444519043, \"value_count\": 14, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 714.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.32088541984558105, \"percentile_inc_nulls\": 0.3237696886062622, \"value_count\": 13, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 741.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.3087436556816101, \"percentile_inc_nulls\": 0.311679482460022, \"value_count\": 12, \"group_name\": \"_zip_code_\", 
\"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 780.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.29453152418136597, \"percentile_inc_nulls\": 0.2975277304649353, \"value_count\": 11, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 913.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.279743492603302, \"percentile_inc_nulls\": 0.2828024625778198, \"value_count\": 10, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 950.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.264893114566803, \"percentile_inc_nulls\": 0.26801520586013794, \"value_count\": 9, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 954.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.24857956171035767, \"percentile_inc_nulls\": 0.25177091360092163, \"value_count\": 8, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1048.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.2263507843017578, \"percentile_inc_nulls\": 0.22963649034500122, \"value_count\": 7, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1428.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.20225399732589722, \"percentile_inc_nulls\": 0.20564210414886475, \"value_count\": 6, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1548.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.1747015118598938, \"percentile_inc_nulls\": 0.17820662260055542, \"value_count\": 5, \"group_name\": 
\"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1770.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.14406687021255493, \"percentile_inc_nulls\": 0.14770209789276123, \"value_count\": 4, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 1968.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.10684764385223389, \"percentile_inc_nulls\": 0.11064094305038452, \"value_count\": 3, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 2391.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.05915224552154541, \"percentile_inc_nulls\": 0.06314808130264282, \"value_count\": 2, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 3064.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.004247069358825684, \"value_count\": 1, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 3800.0, \"distinct_value_count\": 8809}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 900, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"sum_tokens_in_value_count_group\": 900.0, \"distinct_value_count\": 8809}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": 
\"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"zip_code\\\"\", \"subtitle\": \"In this col, 274 values (0.4%) are null and there are 8809 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 900, \"group_name\": \"_zip_code_\", \"value\": \"91302\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 8809}, {\"value_count\": 839, \"group_name\": \"_zip_code_\", \"value\": \"10019\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 8809}, {\"value_count\": 692, \"group_name\": \"_zip_code_\", \"value\": \"10022\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 8809}, {\"value_count\": 692, \"group_name\": \"_zip_code_\", \"value\": \"00000\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 8809}, {\"value_count\": 687, \"group_name\": \"_zip_code_\", \"value\": \"10036\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 8809}, {\"value_count\": 552, \"group_name\": \"_zip_code_\", \"value\": \"50392\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 8809}, {\"value_count\": 504, \"group_name\": \"_zip_code_\", \"value\": \"21044\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 8809}, {\"value_count\": 454, \"group_name\": \"_zip_code_\", \"value\": \"10010\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 8809}, {\"value_count\": 451, \"group_name\": \"_zip_code_\", \"value\": \"55437\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 8809}, {\"value_count\": 414, \"group_name\": \"_zip_code_\", 
\"value\": \"10281\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 8809}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"56071\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 8809}, {\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"02476\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 8809}, {\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"rg41 \", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 8809}, {\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"19804\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 8809}, {\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"54403\", \"total_non_null_rows\": 64241, \"total_rows_inc_nulls\": 64515, \"distinct_value_count\": 8809}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 900]}, \"title\": \"Value count\", \"type\": 
\"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}], \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\"}, {\"mode\": \"vega-lite\"});\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"vconcat\": [{\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9997377991676331, \"percentile_inc_nulls\": 0.9997377991676331, \"value_count\": 8, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 0.9989348649978638, \"percentile_inc_nulls\": 0.9989348649978638, \"value_count\": 7, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 49.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 0.9960836172103882, \"percentile_inc_nulls\": 0.9960836172103882, \"value_count\": 6, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 174.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 0.9845311641693115, \"percentile_inc_nulls\": 0.9845311641693115, \"value_count\": 5, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 705.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 0.9480876922607422, \"percentile_inc_nulls\": 0.9480876922607422, \"value_count\": 4, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 2224.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 0.8398387432098389, \"percentile_inc_nulls\": 0.8398387432098389, \"value_count\": 3, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 6606.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 
0.5750991106033325, \"percentile_inc_nulls\": 0.5750991106033325, \"value_count\": 2, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 16156.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 1, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 35096.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 8, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 46111}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"company_name\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 46111 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 8, \"group_name\": \"_company_name_\", \"value\": \"la jolla pharmaceutical company\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 8, \"group_name\": \"_company_name_\", \"value\": \"comprehensive care corporation\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 
46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"comerica inc /new/\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"dycom industries incorporated\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"omega healthcare investors incorporated\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"empire petroleum corporation\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"stillwater mining co /de/\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"camelot corporation\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"softech incorporated\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 6, \"group_name\": \"_company_name_\", \"value\": \"green plains renewable energy incorporated\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": 
\"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"12 retech corporation\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"11 good energy incorporated\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"1 lane technologies corporation\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"10x capital venture acquisition corporation\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"155 east tropicana limited liability company\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 8]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.8305076360702515, \"percentile_inc_nulls\": 0.8306131958961487, \"value_count\": 10337, \"group_name\": \"_state_\", 
\"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 10337.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.691791832447052, \"percentile_inc_nulls\": 0.6919837594032288, \"value_count\": 8460, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 8460.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.6141864061355591, \"percentile_inc_nulls\": 0.6144266128540039, \"value_count\": 4733, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 4733.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.563307523727417, \"percentile_inc_nulls\": 0.5635794401168823, \"value_count\": 3103, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 3103.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.5253000259399414, \"percentile_inc_nulls\": 0.5255956649780273, \"value_count\": 2318, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 2318.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.49045711755752563, \"percentile_inc_nulls\": 0.4907744526863098, \"value_count\": 2125, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 2125.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.4584508538246155, \"percentile_inc_nulls\": 0.45878803730010986, \"value_count\": 1952, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1952.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.42987143993377686, \"percentile_inc_nulls\": 0.4302264451980591, \"value_count\": 1743, \"group_name\": 
\"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1743.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.4037187695503235, \"percentile_inc_nulls\": 0.4040900468826294, \"value_count\": 1595, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1595.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.3788614273071289, \"percentile_inc_nulls\": 0.3792482018470764, \"value_count\": 1516, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1516.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.35418444871902466, \"percentile_inc_nulls\": 0.35458654165267944, \"value_count\": 1505, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1505.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.33116352558135986, \"percentile_inc_nulls\": 0.3315799832344055, \"value_count\": 1404, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1404.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.3082737326622009, \"percentile_inc_nulls\": 0.3087044954299927, \"value_count\": 1396, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1396.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.2873516082763672, \"percentile_inc_nulls\": 0.28779536485671997, \"value_count\": 1276, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1276.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.26756083965301514, \"percentile_inc_nulls\": 0.2680169343948364, \"value_count\": 1207, 
\"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1207.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.24914735555648804, \"percentile_inc_nulls\": 0.2496148943901062, \"value_count\": 1123, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1123.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.23243916034698486, \"percentile_inc_nulls\": 0.23291712999343872, \"value_count\": 1019, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1019.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.21587854623794556, \"percentile_inc_nulls\": 0.21636676788330078, \"value_count\": 1010, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1010.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.20199054479599, \"percentile_inc_nulls\": 0.20248746871948242, \"value_count\": 847, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 847.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.18875843286514282, \"percentile_inc_nulls\": 0.18926358222961426, \"value_count\": 807, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 807.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.17613303661346436, \"percentile_inc_nulls\": 0.17664599418640137, \"value_count\": 770, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 770.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.1640486717224121, \"percentile_inc_nulls\": 0.1645691990852356, \"value_count\": 
737, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 737.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.15265297889709473, \"percentile_inc_nulls\": 0.15318059921264648, \"value_count\": 695, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 695.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.14209353923797607, \"percentile_inc_nulls\": 0.14262771606445312, \"value_count\": 644, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 644.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.13258343935012817, \"percentile_inc_nulls\": 0.13312357664108276, \"value_count\": 580, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 580.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.12354886531829834, \"percentile_inc_nulls\": 0.12409466505050659, \"value_count\": 551, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 551.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.11457991600036621, \"percentile_inc_nulls\": 0.11513125896453857, \"value_count\": 547, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 547.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.10720139741897583, \"percentile_inc_nulls\": 0.1077573299407959, \"value_count\": 450, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 450.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.10013443231582642, \"percentile_inc_nulls\": 0.10069477558135986, 
\"value_count\": 431, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 431.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0935102105140686, \"percentile_inc_nulls\": 0.09407466650009155, \"value_count\": 404, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 404.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.08788615465164185, \"percentile_inc_nulls\": 0.08845412731170654, \"value_count\": 343, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 343.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.08291792869567871, \"percentile_inc_nulls\": 0.08348900079727173, \"value_count\": 303, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 303.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.07842528820037842, \"percentile_inc_nulls\": 0.0789991021156311, \"value_count\": 274, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 274.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.07416212558746338, \"percentile_inc_nulls\": 0.07473862171173096, \"value_count\": 260, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 260.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.07007938623428345, \"percentile_inc_nulls\": 0.07065838575363159, \"value_count\": 249, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 249.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.06240570545196533, \"percentile_inc_nulls\": 0.06298953294754028, 
\"value_count\": 234, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 468.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0587000846862793, \"percentile_inc_nulls\": 0.05928617715835571, \"value_count\": 226, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 226.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.05512559413909912, \"percentile_inc_nulls\": 0.055713951587677, \"value_count\": 218, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 218.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.05199384689331055, \"percentile_inc_nulls\": 0.052584171295166016, \"value_count\": 191, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 191.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.048944056034088135, \"percentile_inc_nulls\": 0.049536287784576416, \"value_count\": 186, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 186.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.04597628116607666, \"percentile_inc_nulls\": 0.0465703010559082, \"value_count\": 181, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 181.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.04348397254943848, \"percentile_inc_nulls\": 0.044079601764678955, \"value_count\": 152, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 152.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.041270434856414795, \"percentile_inc_nulls\": 
0.04186737537384033, \"value_count\": 135, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 135.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.03918802738189697, \"percentile_inc_nulls\": 0.039786338806152344, \"value_count\": 127, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 127.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.037335216999053955, \"percentile_inc_nulls\": 0.03793466091156006, \"value_count\": 113, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 113.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.03549879789352417, \"percentile_inc_nulls\": 0.036099374294281006, \"value_count\": 112, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.03369516134262085, \"percentile_inc_nulls\": 0.03429687023162842, \"value_count\": 110, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.03195708990097046, \"percentile_inc_nulls\": 0.032559871673583984, \"value_count\": 106, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 106.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.03026825189590454, \"percentile_inc_nulls\": 0.030872106552124023, \"value_count\": 103, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 103.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.028595805168151855, 
\"percentile_inc_nulls\": 0.02920067310333252, \"value_count\": 102, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.026972532272338867, \"percentile_inc_nulls\": 0.027578413486480713, \"value_count\": 99, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.02552962303161621, \"percentile_inc_nulls\": 0.026136398315429688, \"value_count\": 88, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.022938966751098633, \"percentile_inc_nulls\": 0.023547351360321045, \"value_count\": 79, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 158.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.021774768829345703, \"percentile_inc_nulls\": 0.022383928298950195, \"value_count\": 71, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 71.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.020659804344177246, \"percentile_inc_nulls\": 0.021269619464874268, \"value_count\": 68, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 68.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.01956123113632202, \"percentile_inc_nulls\": 0.020171701908111572, \"value_count\": 67, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 67.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 
0.018577396869659424, \"percentile_inc_nulls\": 0.019188523292541504, \"value_count\": 60, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 60.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.017626404762268066, \"percentile_inc_nulls\": 0.0182381272315979, \"value_count\": 58, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.015757203102111816, \"percentile_inc_nulls\": 0.016370058059692383, \"value_count\": 57, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.014838993549346924, \"percentile_inc_nulls\": 0.015452444553375244, \"value_count\": 56, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.014019131660461426, \"percentile_inc_nulls\": 0.014633119106292725, \"value_count\": 50, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 50.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0132485032081604, \"percentile_inc_nulls\": 0.013862967491149902, \"value_count\": 47, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 47.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.012494266033172607, \"percentile_inc_nulls\": 0.013109147548675537, \"value_count\": 46, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 46.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 
0.011789202690124512, \"percentile_inc_nulls\": 0.012404561042785645, \"value_count\": 43, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 43.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.01121532917022705, \"percentile_inc_nulls\": 0.011831045150756836, \"value_count\": 35, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 35.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.010674238204956055, \"percentile_inc_nulls\": 0.011290252208709717, \"value_count\": 33, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.010149538516998291, \"percentile_inc_nulls\": 0.010765910148620605, \"value_count\": 32, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 32.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.009674012660980225, \"percentile_inc_nulls\": 0.010290682315826416, \"value_count\": 29, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 29.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.009264111518859863, \"percentile_inc_nulls\": 0.009881019592285156, \"value_count\": 25, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 25.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.008870601654052734, \"percentile_inc_nulls\": 0.009487748146057129, \"value_count\": 24, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 24.0, \"distinct_value_count\": 172}, 
{\"percentile_ex_nulls\": 0.008181929588317871, \"percentile_inc_nulls\": 0.008799552917480469, \"value_count\": 21, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.007870376110076904, \"percentile_inc_nulls\": 0.008488178253173828, \"value_count\": 19, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 19.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.007575273513793945, \"percentile_inc_nulls\": 0.008193254470825195, \"value_count\": 18, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 18.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.007050573825836182, \"percentile_inc_nulls\": 0.007668852806091309, \"value_count\": 16, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 32.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.00680464506149292, \"percentile_inc_nulls\": 0.007423043251037598, \"value_count\": 15, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 15.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.006115972995758057, \"percentile_inc_nulls\": 0.0067348480224609375, \"value_count\": 14, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.00590282678604126, \"percentile_inc_nulls\": 0.006521821022033691, \"value_count\": 13, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 13.0, \"distinct_value_count\": 
172}, {\"percentile_ex_nulls\": 0.00570601224899292, \"percentile_inc_nulls\": 0.006325185298919678, \"value_count\": 12, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 12.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.005164921283721924, \"percentile_inc_nulls\": 0.005784392356872559, \"value_count\": 11, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0041811466217041016, \"percentile_inc_nulls\": 0.00480121374130249, \"value_count\": 10, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 60.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.003885984420776367, \"percentile_inc_nulls\": 0.004506289958953857, \"value_count\": 9, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 18.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.003623664379119873, \"percentile_inc_nulls\": 0.004244089126586914, \"value_count\": 8, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0035088658332824707, \"percentile_inc_nulls\": 0.0041294097900390625, \"value_count\": 7, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 7.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.002721846103668213, \"percentile_inc_nulls\": 0.003342866897583008, \"value_count\": 6, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 48.0, \"distinct_value_count\": 
172}, {\"percentile_ex_nulls\": 0.0019840002059936523, \"percentile_inc_nulls\": 0.002605438232421875, \"value_count\": 5, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 45.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0012625455856323242, \"percentile_inc_nulls\": 0.00188446044921875, \"value_count\": 4, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 44.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0007214546203613281, \"percentile_inc_nulls\": 0.0013436675071716309, \"value_count\": 3, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0003935098648071289, \"percentile_inc_nulls\": 0.0010159611701965332, \"value_count\": 2, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 20.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0006226897239685059, \"value_count\": 1, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 24.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 10337, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 10337.0, \"distinct_value_count\": 172}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": 
\"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"state\\\"\", \"subtitle\": \"In this col, 38 values (0.1%) are null and there are 172 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 10337, \"group_name\": \"_state_\", \"value\": \"ca\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 8460, \"group_name\": \"_state_\", \"value\": \"ny\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 4733, \"group_name\": \"_state_\", \"value\": \"tx\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 3103, \"group_name\": \"_state_\", \"value\": \"fl\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 2318, \"group_name\": \"_state_\", \"value\": \"ma\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 2125, \"group_name\": \"_state_\", \"value\": \"il\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1952, \"group_name\": \"_state_\", \"value\": \"nj\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1743, \"group_name\": \"_state_\", \"value\": \"pa\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1595, \"group_name\": \"_state_\", \"value\": \"md\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, 
\"distinct_value_count\": 172}, {\"value_count\": 1516, \"group_name\": \"_state_\", \"value\": \"nv\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"a7\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"s9\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"w5\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"j1\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"2a\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 
10337]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.8892185091972351, \"percentile_inc_nulls\": 0.8892439603805542, \"value_count\": 6759, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 6759.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.8651740550994873, \"percentile_inc_nulls\": 0.8652049899101257, \"value_count\": 1467, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1467.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.8483904600143433, \"percentile_inc_nulls\": 0.8484252691268921, \"value_count\": 1024, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1024.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.8343932628631592, \"percentile_inc_nulls\": 0.8344312310218811, \"value_count\": 854, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 854.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.8206254243850708, \"percentile_inc_nulls\": 0.8206666111946106, \"value_count\": 840, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 840.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.8072838187217712, \"percentile_inc_nulls\": 0.8073280453681946, \"value_count\": 814, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 814.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7944011092185974, 
\"percentile_inc_nulls\": 0.7944482564926147, \"value_count\": 786, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 786.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7821412086486816, \"percentile_inc_nulls\": 0.7821912169456482, \"value_count\": 748, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 748.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7701107859611511, \"percentile_inc_nulls\": 0.7701635360717773, \"value_count\": 734, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 734.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7582606673240662, \"percentile_inc_nulls\": 0.758316159248352, \"value_count\": 723, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 723.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7466564178466797, \"percentile_inc_nulls\": 0.7467144727706909, \"value_count\": 708, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 708.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7364616394042969, \"percentile_inc_nulls\": 0.7365221381187439, \"value_count\": 622, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 622.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7267422676086426, \"percentile_inc_nulls\": 0.7268049716949463, \"value_count\": 593, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 593.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7170720100402832, 
\"percentile_inc_nulls\": 0.7171369791030884, \"value_count\": 590, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 590.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7074673771858215, \"percentile_inc_nulls\": 0.7075344920158386, \"value_count\": 586, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 586.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6978955268859863, \"percentile_inc_nulls\": 0.6979647874832153, \"value_count\": 584, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 584.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.688372790813446, \"percentile_inc_nulls\": 0.6884442567825317, \"value_count\": 581, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 581.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6793090105056763, \"percentile_inc_nulls\": 0.6793825626373291, \"value_count\": 553, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 553.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6723759174346924, \"percentile_inc_nulls\": 0.6724510788917542, \"value_count\": 423, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 423.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6655576229095459, \"percentile_inc_nulls\": 0.6656343340873718, \"value_count\": 416, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 416.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6590179204940796, 
\"percentile_inc_nulls\": 0.6590961217880249, \"value_count\": 399, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 399.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6526912450790405, \"percentile_inc_nulls\": 0.6527709364891052, \"value_count\": 386, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 386.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6471513509750366, \"percentile_inc_nulls\": 0.6472322940826416, \"value_count\": 338, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 338.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6416934728622437, \"percentile_inc_nulls\": 0.6417756080627441, \"value_count\": 333, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 333.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6312692165374756, \"percentile_inc_nulls\": 0.6313538551330566, \"value_count\": 318, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 636.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.626384973526001, \"percentile_inc_nulls\": 0.626470685005188, \"value_count\": 298, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 298.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6217301487922668, \"percentile_inc_nulls\": 0.6218169331550598, \"value_count\": 284, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 284.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.612518846988678, 
\"percentile_inc_nulls\": 0.6126077175140381, \"value_count\": 281, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 562.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6079951524734497, \"percentile_inc_nulls\": 0.608085036277771, \"value_count\": 276, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 276.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.603569746017456, \"percentile_inc_nulls\": 0.6036607027053833, \"value_count\": 270, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 270.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5992263555526733, \"percentile_inc_nulls\": 0.5993183255195618, \"value_count\": 265, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 265.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5948993563652039, \"percentile_inc_nulls\": 0.5949922800064087, \"value_count\": 264, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 264.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5908181667327881, \"percentile_inc_nulls\": 0.5909121036529541, \"value_count\": 249, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 249.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5869500637054443, \"percentile_inc_nulls\": 0.5870448350906372, \"value_count\": 236, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 236.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5831148028373718, 
\"percentile_inc_nulls\": 0.5832104682922363, \"value_count\": 234, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.575706422328949, \"percentile_inc_nulls\": 0.5758037567138672, \"value_count\": 226, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 452.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5720186233520508, \"percentile_inc_nulls\": 0.5721167922019958, \"value_count\": 225, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 225.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5685439109802246, \"percentile_inc_nulls\": 0.5686428546905518, \"value_count\": 212, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 212.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5616599917411804, \"percentile_inc_nulls\": 0.5617605447769165, \"value_count\": 210, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 420.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.558234453201294, \"percentile_inc_nulls\": 0.5583357810974121, \"value_count\": 209, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 209.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5548253059387207, \"percentile_inc_nulls\": 0.554927408695221, \"value_count\": 208, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 208.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5514816641807556, 
\"percentile_inc_nulls\": 0.5515846014022827, \"value_count\": 204, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 204.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5482856035232544, \"percentile_inc_nulls\": 0.548389196395874, \"value_count\": 195, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 195.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5451222658157349, \"percentile_inc_nulls\": 0.5452266335487366, \"value_count\": 193, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 193.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5419917106628418, \"percentile_inc_nulls\": 0.5420968532562256, \"value_count\": 191, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 191.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5389431715011597, \"percentile_inc_nulls\": 0.5390489101409912, \"value_count\": 186, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 186.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5359601378440857, \"percentile_inc_nulls\": 0.5360665917396545, \"value_count\": 182, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 182.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.532993495464325, \"percentile_inc_nulls\": 0.5331006050109863, \"value_count\": 181, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 181.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5302727222442627, 
\"percentile_inc_nulls\": 0.5303804874420166, \"value_count\": 166, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 166.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5275847315788269, \"percentile_inc_nulls\": 0.5276931524276733, \"value_count\": 164, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 164.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5249786972999573, \"percentile_inc_nulls\": 0.5250876545906067, \"value_count\": 159, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 159.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5224218368530273, \"percentile_inc_nulls\": 0.5225313901901245, \"value_count\": 156, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 156.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5198976993560791, \"percentile_inc_nulls\": 0.520007848739624, \"value_count\": 154, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 154.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5173900127410889, \"percentile_inc_nulls\": 0.5175007581710815, \"value_count\": 153, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 153.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5148987174034119, \"percentile_inc_nulls\": 0.5150099992752075, \"value_count\": 152, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 152.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5124237537384033, 
\"percentile_inc_nulls\": 0.5125356316566467, \"value_count\": 151, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 151.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5099652409553528, \"percentile_inc_nulls\": 0.510077714920044, \"value_count\": 150, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 150.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5075395107269287, \"percentile_inc_nulls\": 0.5076524615287781, \"value_count\": 148, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 148.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5051465034484863, \"percentile_inc_nulls\": 0.5052600502967834, \"value_count\": 146, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 146.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4980167746543884, \"percentile_inc_nulls\": 0.4981319308280945, \"value_count\": 145, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 435.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.48621582984924316, \"percentile_inc_nulls\": 0.4863336682319641, \"value_count\": 144, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 720.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4838883876800537, \"percentile_inc_nulls\": 0.4840068221092224, \"value_count\": 142, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 142.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.48161017894744873, 
\"percentile_inc_nulls\": 0.48172909021377563, \"value_count\": 139, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 139.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.47936469316482544, \"percentile_inc_nulls\": 0.4794841408729553, \"value_count\": 137, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 137.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.47713565826416016, \"percentile_inc_nulls\": 0.47725558280944824, \"value_count\": 136, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 136.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4749557375907898, \"percentile_inc_nulls\": 0.47507619857788086, \"value_count\": 133, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 133.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.47279220819473267, \"percentile_inc_nulls\": 0.4729132056236267, \"value_count\": 132, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 132.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4706615209579468, \"percentile_inc_nulls\": 0.47078293561935425, \"value_count\": 130, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 130.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4685635566711426, \"percentile_inc_nulls\": 0.468685507774353, \"value_count\": 128, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 128.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4664819836616516, 
\"percentile_inc_nulls\": 0.46660441160202026, \"value_count\": 127, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 127.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4644496440887451, \"percentile_inc_nulls\": 0.4645724892616272, \"value_count\": 124, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 124.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4624336361885071, \"percentile_inc_nulls\": 0.46255695819854736, \"value_count\": 123, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 123.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4604504108428955, \"percentile_inc_nulls\": 0.460574209690094, \"value_count\": 121, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 121.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.45848357677459717, \"percentile_inc_nulls\": 0.4586077928543091, \"value_count\": 120, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.45458269119262695, \"percentile_inc_nulls\": 0.4547078013420105, \"value_count\": 119, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 238.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4526650309562683, \"percentile_inc_nulls\": 0.45279061794281006, \"value_count\": 117, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 117.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.45078015327453613, 
\"percentile_inc_nulls\": 0.4509061574935913, \"value_count\": 115, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 115.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.445223867893219, \"percentile_inc_nulls\": 0.44535118341445923, \"value_count\": 113, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 339.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4416508078575134, \"percentile_inc_nulls\": 0.44177889823913574, \"value_count\": 109, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 218.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4398806691169739, \"percentile_inc_nulls\": 0.4400091767311096, \"value_count\": 108, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.43812692165374756, \"percentile_inc_nulls\": 0.43825584650039673, \"value_count\": 107, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 107.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.43468499183654785, \"percentile_inc_nulls\": 0.4348146915435791, \"value_count\": 105, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 210.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4314069151878357, \"percentile_inc_nulls\": 0.431537389755249, \"value_count\": 100, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.42978429794311523, 
\"percentile_inc_nulls\": 0.4299151301383972, \"value_count\": 99, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.428178071975708, \"percentile_inc_nulls\": 0.42830926179885864, \"value_count\": 98, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.42499834299087524, \"percentile_inc_nulls\": 0.4251302480697632, \"value_count\": 97, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 194.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4234248995780945, \"percentile_inc_nulls\": 0.4235571622848511, \"value_count\": 96, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 96.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4218842387199402, \"percentile_inc_nulls\": 0.42201685905456543, \"value_count\": 94, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 94.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4203763008117676, \"percentile_inc_nulls\": 0.4205092787742615, \"value_count\": 92, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 92.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.418884813785553, \"percentile_inc_nulls\": 0.4190181493759155, \"value_count\": 91, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 91.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4145086407661438, 
\"percentile_inc_nulls\": 0.41464293003082275, \"value_count\": 89, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 267.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4131154417991638, \"percentile_inc_nulls\": 0.4132500886917114, \"value_count\": 85, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 85.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4103618860244751, \"percentile_inc_nulls\": 0.41049718856811523, \"value_count\": 84, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 168.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.40764111280441284, \"percentile_inc_nulls\": 0.40777701139450073, \"value_count\": 83, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 166.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.40495312213897705, \"percentile_inc_nulls\": 0.4050896167755127, \"value_count\": 82, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 164.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4022979140281677, \"percentile_inc_nulls\": 0.4024350047111511, \"value_count\": 81, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 162.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4009866714477539, \"percentile_inc_nulls\": 0.40112411975860596, \"value_count\": 80, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 80.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3983970284461975, 
\"percentile_inc_nulls\": 0.3985350728034973, \"value_count\": 79, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 158.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3971513509750366, \"percentile_inc_nulls\": 0.3972896933555603, \"value_count\": 76, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 76.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3934635519981384, \"percentile_inc_nulls\": 0.39360272884368896, \"value_count\": 75, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 225.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.39103782176971436, \"percentile_inc_nulls\": 0.39117753505706787, \"value_count\": 74, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 148.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3863174319267273, \"percentile_inc_nulls\": 0.38645821809768677, \"value_count\": 72, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 288.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.38515371084213257, \"percentile_inc_nulls\": 0.3852947950363159, \"value_count\": 71, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 71.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.38400644063949585, \"percentile_inc_nulls\": 0.3841477632522583, \"value_count\": 70, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.38061362504959106, 
\"percentile_inc_nulls\": 0.3807557225227356, \"value_count\": 69, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 207.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3783845901489258, \"percentile_inc_nulls\": 0.3785271644592285, \"value_count\": 68, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 136.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3773028254508972, \"percentile_inc_nulls\": 0.37744569778442383, \"value_count\": 66, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 66.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3741067051887512, \"percentile_inc_nulls\": 0.37425029277801514, \"value_count\": 65, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 195.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3730577826499939, \"percentile_inc_nulls\": 0.3732016086578369, \"value_count\": 64, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 64.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.37202519178390503, \"percentile_inc_nulls\": 0.37216925621032715, \"value_count\": 63, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 63.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3710089921951294, \"percentile_inc_nulls\": 0.3711532950401306, \"value_count\": 62, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 62.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3690093755722046, 
\"percentile_inc_nulls\": 0.369154155254364, \"value_count\": 61, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 122.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3660591244697571, \"percentile_inc_nulls\": 0.36620455980300903, \"value_count\": 60, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 180.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.36130595207214355, \"percentile_inc_nulls\": 0.36145251989364624, \"value_count\": 58, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 290.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3594374656677246, \"percentile_inc_nulls\": 0.3595844507217407, \"value_count\": 57, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.35851961374282837, \"percentile_inc_nulls\": 0.3586667776107788, \"value_count\": 56, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.35671669244766235, \"percentile_inc_nulls\": 0.3568642735481262, \"value_count\": 55, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3549465537071228, \"percentile_inc_nulls\": 0.3550945520401001, \"value_count\": 54, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.35147184133529663, 
\"percentile_inc_nulls\": 0.351620614528656, \"value_count\": 53, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 212.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3489149808883667, \"percentile_inc_nulls\": 0.34906435012817383, \"value_count\": 52, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 156.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.34724318981170654, \"percentile_inc_nulls\": 0.3473929166793823, \"value_count\": 51, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3439651131629944, \"percentile_inc_nulls\": 0.34411561489105225, \"value_count\": 50, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.33994948863983154, \"percentile_inc_nulls\": 0.3401009440422058, \"value_count\": 49, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 245.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3344423770904541, \"percentile_inc_nulls\": 0.33459508419036865, \"value_count\": 48, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 336.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.33213138580322266, \"percentile_inc_nulls\": 0.3322846293449402, \"value_count\": 47, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 141.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3291155695915222, 
\"percentile_inc_nulls\": 0.3292694687843323, \"value_count\": 46, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 184.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3261653184890747, \"percentile_inc_nulls\": 0.32631993293762207, \"value_count\": 45, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 180.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3211171627044678, \"percentile_inc_nulls\": 0.32127290964126587, \"value_count\": 44, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 308.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3168884515762329, \"percentile_inc_nulls\": 0.3170452117919922, \"value_count\": 43, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 258.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.31482332944869995, \"percentile_inc_nulls\": 0.31498050689697266, \"value_count\": 42, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 126.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3101193308830261, \"percentile_inc_nulls\": 0.3102775812149048, \"value_count\": 41, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 287.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.30749690532684326, \"percentile_inc_nulls\": 0.3076557517051697, \"value_count\": 40, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3030223846435547, 
\"percentile_inc_nulls\": 0.3031822443008423, \"value_count\": 39, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 273.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.30115389823913574, \"percentile_inc_nulls\": 0.30131417512893677, \"value_count\": 38, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2987281084060669, \"percentile_inc_nulls\": 0.2988889813423157, \"value_count\": 37, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 148.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2957778573036194, \"percentile_inc_nulls\": 0.29593944549560547, \"value_count\": 36, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 180.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.29348325729370117, \"percentile_inc_nulls\": 0.29364532232284546, \"value_count\": 35, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 140.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2884678244590759, \"percentile_inc_nulls\": 0.2886310815811157, \"value_count\": 34, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 306.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.285222589969635, \"percentile_inc_nulls\": 0.2853865623474121, \"value_count\": 33, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 198.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2831246256828308, 
\"percentile_inc_nulls\": 0.2832890748977661, \"value_count\": 32, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 128.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2770274877548218, \"percentile_inc_nulls\": 0.2771933078765869, \"value_count\": 31, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 372.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2735854983329773, \"percentile_inc_nulls\": 0.2737521529197693, \"value_count\": 30, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 210.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2726348638534546, \"percentile_inc_nulls\": 0.2728017568588257, \"value_count\": 29, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.26942241191864014, \"percentile_inc_nulls\": 0.26959002017974854, \"value_count\": 28, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 196.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.26632463932037354, \"percentile_inc_nulls\": 0.26649296283721924, \"value_count\": 27, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 189.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.26078474521636963, \"percentile_inc_nulls\": 0.2609543204307556, \"value_count\": 26, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 338.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.25586771965026855, 
\"percentile_inc_nulls\": 0.2560384273529053, \"value_count\": 25, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 300.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.24996721744537354, \"percentile_inc_nulls\": 0.2501392960548401, \"value_count\": 24, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 360.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.24544352293014526, \"percentile_inc_nulls\": 0.245616614818573, \"value_count\": 23, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 276.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.23895299434661865, \"percentile_inc_nulls\": 0.23912757635116577, \"value_count\": 22, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 396.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.23551106452941895, \"percentile_inc_nulls\": 0.23568642139434814, \"value_count\": 21, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 210.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2273159623146057, \"percentile_inc_nulls\": 0.22749322652816772, \"value_count\": 20, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 500.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2207762598991394, \"percentile_inc_nulls\": 0.2209550142288208, \"value_count\": 19, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 399.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2142857313156128, 
\"percentile_inc_nulls\": 0.21446597576141357, \"value_count\": 18, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 396.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2109420895576477, \"percentile_inc_nulls\": 0.21112310886383057, \"value_count\": 17, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 204.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.20648396015167236, \"percentile_inc_nulls\": 0.2066659927368164, \"value_count\": 16, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 272.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.1981249451637268, \"percentile_inc_nulls\": 0.19830894470214844, \"value_count\": 15, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 510.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.19101160764694214, \"percentile_inc_nulls\": 0.1911972165107727, \"value_count\": 14, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 434.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.18227559328079224, \"percentile_inc_nulls\": 0.18246322870254517, \"value_count\": 13, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 533.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.17460501194000244, \"percentile_inc_nulls\": 0.17479437589645386, \"value_count\": 12, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 468.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.16559040546417236, 
\"percentile_inc_nulls\": 0.16578179597854614, \"value_count\": 11, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 550.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.15854257345199585, \"percentile_inc_nulls\": 0.15873563289642334, \"value_count\": 10, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 430.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.14836424589157104, \"percentile_inc_nulls\": 0.14855962991714478, \"value_count\": 9, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 621.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.13669443130493164, \"percentile_inc_nulls\": 0.13689249753952026, \"value_count\": 8, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 712.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.12717169523239136, \"percentile_inc_nulls\": 0.12737196683883667, \"value_count\": 7, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 581.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.11497735977172852, \"percentile_inc_nulls\": 0.11518043279647827, \"value_count\": 6, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 744.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.09989839792251587, \"percentile_inc_nulls\": 0.10010486841201782, \"value_count\": 5, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 920.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.0844915509223938, 
\"percentile_inc_nulls\": 0.08470159769058228, \"value_count\": 4, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 940.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.06585592031478882, \"percentile_inc_nulls\": 0.06607019901275635, \"value_count\": 3, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1137.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.040352702140808105, \"percentile_inc_nulls\": 0.0405728816986084, \"value_count\": 2, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1556.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.00022941827774047852, \"value_count\": 1, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 2462.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 6759, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 6759.0, \"distinct_value_count\": 5121}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"city\\\"\", 
\"subtitle\": \"In this col, 14 values (0.0%) are null and there are 5121 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 6759, \"group_name\": \"_city_\", \"value\": \"new york\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 1467, \"group_name\": \"_city_\", \"value\": \"houston\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 1024, \"group_name\": \"_city_\", \"value\": \"dallas\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 854, \"group_name\": \"_city_\", \"value\": \"las vegas\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 840, \"group_name\": \"_city_\", \"value\": \"calabasas\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 814, \"group_name\": \"_city_\", \"value\": \"charlotte\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 786, \"group_name\": \"_city_\", \"value\": \"chicago\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 748, \"group_name\": \"_city_\", \"value\": \"san diego\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 734, \"group_name\": \"_city_\", \"value\": \"wilmington\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 723, \"group_name\": \"_city_\", \"value\": \"boston\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": 
\"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"downes grove\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"l-1855 luxembourg\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"bnei-brak\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"beavercreek\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"ft. 
myers,\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 6759]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9869236350059509, \"percentile_inc_nulls\": 0.9869236350059509, \"value_count\": 798, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 798.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9778782725334167, \"percentile_inc_nulls\": 0.9778782725334167, \"value_count\": 552, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 552.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9721266627311707, \"percentile_inc_nulls\": 0.9721266627311707, \"value_count\": 351, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 351.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9664405584335327, \"percentile_inc_nulls\": 0.9664405584335327, \"value_count\": 347, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 347.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9614754319190979, 
\"percentile_inc_nulls\": 0.9614754319190979, \"value_count\": 303, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 303.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9569363594055176, \"percentile_inc_nulls\": 0.9569363594055176, \"value_count\": 277, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 277.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9525775909423828, \"percentile_inc_nulls\": 0.9525775909423828, \"value_count\": 266, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 266.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9486120939254761, \"percentile_inc_nulls\": 0.9486120939254761, \"value_count\": 242, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 242.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9448267817497253, \"percentile_inc_nulls\": 0.9448267817497253, \"value_count\": 231, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 231.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9411562085151672, \"percentile_inc_nulls\": 0.9411562085151672, \"value_count\": 224, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 224.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.937616765499115, \"percentile_inc_nulls\": 0.937616765499115, \"value_count\": 216, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 216.0, 
\"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9341428279876709, \"percentile_inc_nulls\": 0.9341428279876709, \"value_count\": 212, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 212.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.930881917476654, \"percentile_inc_nulls\": 0.930881917476654, \"value_count\": 199, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 199.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9276373982429504, \"percentile_inc_nulls\": 0.9276373982429504, \"value_count\": 198, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 198.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9246222972869873, \"percentile_inc_nulls\": 0.9246222972869873, \"value_count\": 184, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 184.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9217710494995117, \"percentile_inc_nulls\": 0.9217710494995117, \"value_count\": 174, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 174.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9162651896476746, \"percentile_inc_nulls\": 0.9162651896476746, \"value_count\": 168, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 336.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9136925339698792, \"percentile_inc_nulls\": 0.9136925339698792, \"value_count\": 157, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, 
\"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 157.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9112181663513184, \"percentile_inc_nulls\": 0.9112181663513184, \"value_count\": 151, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 151.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9088093638420105, \"percentile_inc_nulls\": 0.9088093638420105, \"value_count\": 147, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 147.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9065152406692505, \"percentile_inc_nulls\": 0.9065152406692505, \"value_count\": 140, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 140.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9042539000511169, \"percentile_inc_nulls\": 0.9042539000511169, \"value_count\": 138, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9021236896514893, \"percentile_inc_nulls\": 0.9021236896514893, \"value_count\": 130, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 130.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9001245498657227, \"percentile_inc_nulls\": 0.9001245498657227, \"value_count\": 122, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 122.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8982564806938171, \"percentile_inc_nulls\": 0.8982564806938171, \"value_count\": 114, 
\"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8964048027992249, \"percentile_inc_nulls\": 0.8964048027992249, \"value_count\": 113, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 113.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8946186900138855, \"percentile_inc_nulls\": 0.8946186900138855, \"value_count\": 109, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 109.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8929636478424072, \"percentile_inc_nulls\": 0.8929636478424072, \"value_count\": 101, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 101.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8913413882255554, \"percentile_inc_nulls\": 0.8913413882255554, \"value_count\": 99, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8897355198860168, \"percentile_inc_nulls\": 0.8897355198860168, \"value_count\": 98, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8866220712661743, \"percentile_inc_nulls\": 0.8866220712661743, \"value_count\": 95, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 190.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8850981593132019, 
\"percentile_inc_nulls\": 0.8850981593132019, \"value_count\": 93, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 93.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8836561441421509, \"percentile_inc_nulls\": 0.8836561441421509, \"value_count\": 88, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8822305202484131, \"percentile_inc_nulls\": 0.8822305202484131, \"value_count\": 87, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 87.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8810015320777893, \"percentile_inc_nulls\": 0.8810015320777893, \"value_count\": 75, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 75.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8798708915710449, \"percentile_inc_nulls\": 0.8798708915710449, \"value_count\": 69, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 69.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.877707839012146, \"percentile_inc_nulls\": 0.877707839012146, \"value_count\": 66, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 132.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8766427636146545, \"percentile_inc_nulls\": 0.8766427636146545, \"value_count\": 65, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 65.0, \"distinct_value_count\": 
36703}, {\"percentile_ex_nulls\": 0.8756431937217712, \"percentile_inc_nulls\": 0.8756431937217712, \"value_count\": 61, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 61.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8746927380561829, \"percentile_inc_nulls\": 0.8746927380561829, \"value_count\": 58, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8737751245498657, \"percentile_inc_nulls\": 0.8737751245498657, \"value_count\": 56, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8728902339935303, \"percentile_inc_nulls\": 0.8728902339935303, \"value_count\": 54, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 54.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8711532950401306, \"percentile_inc_nulls\": 0.8711532950401306, \"value_count\": 53, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 106.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8703175783157349, \"percentile_inc_nulls\": 0.8703175783157349, \"value_count\": 51, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 51.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8686789274215698, \"percentile_inc_nulls\": 0.8686789274215698, \"value_count\": 50, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, 
\"sum_tokens_in_value_count_group\": 100.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8678759932518005, \"percentile_inc_nulls\": 0.8678759932518005, \"value_count\": 49, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 49.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.866401195526123, \"percentile_inc_nulls\": 0.866401195526123, \"value_count\": 45, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8656966090202332, \"percentile_inc_nulls\": 0.8656966090202332, \"value_count\": 43, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 43.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8650083541870117, \"percentile_inc_nulls\": 0.8650083541870117, \"value_count\": 42, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8643528819084167, \"percentile_inc_nulls\": 0.8643528819084167, \"value_count\": 40, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 40.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.863074779510498, \"percentile_inc_nulls\": 0.863074779510498, \"value_count\": 39, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.861829400062561, \"percentile_inc_nulls\": 0.861829400062561, \"value_count\": 38, \"group_name\": \"_street_address_\", 
\"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 76.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8600596189498901, \"percentile_inc_nulls\": 0.8600596189498901, \"value_count\": 36, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8589125871658325, \"percentile_inc_nulls\": 0.8589125871658325, \"value_count\": 35, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.857241153717041, \"percentile_inc_nulls\": 0.857241153717041, \"value_count\": 34, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.855618953704834, \"percentile_inc_nulls\": 0.855618953704834, \"value_count\": 33, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8545701503753662, \"percentile_inc_nulls\": 0.8545701503753662, \"value_count\": 32, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 64.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8530462384223938, \"percentile_inc_nulls\": 0.8530462384223938, \"value_count\": 31, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 93.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8525546789169312, \"percentile_inc_nulls\": 0.8525546789169312, 
\"value_count\": 30, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 30.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8511290550231934, \"percentile_inc_nulls\": 0.8511290550231934, \"value_count\": 29, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 87.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8492937088012695, \"percentile_inc_nulls\": 0.8492937088012695, \"value_count\": 28, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8470815420150757, \"percentile_inc_nulls\": 0.8470815420150757, \"value_count\": 27, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 135.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8453773856163025, \"percentile_inc_nulls\": 0.8453773856163025, \"value_count\": 26, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 104.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8408710956573486, \"percentile_inc_nulls\": 0.8408710956573486, \"value_count\": 25, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 275.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8389047384262085, \"percentile_inc_nulls\": 0.8389047384262085, \"value_count\": 24, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 
0.8373971581459045, \"percentile_inc_nulls\": 0.8373971581459045, \"value_count\": 23, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 92.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8363156318664551, \"percentile_inc_nulls\": 0.8363156318664551, \"value_count\": 22, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 66.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8345950841903687, \"percentile_inc_nulls\": 0.8345950841903687, \"value_count\": 21, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 105.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8319732546806335, \"percentile_inc_nulls\": 0.8319732546806335, \"value_count\": 20, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.828859806060791, \"percentile_inc_nulls\": 0.828859806060791, \"value_count\": 19, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 190.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8262052536010742, \"percentile_inc_nulls\": 0.8262052536010742, \"value_count\": 18, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 162.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8250909447669983, \"percentile_inc_nulls\": 0.8250909447669983, \"value_count\": 17, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 68.0, 
\"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8222069144248962, \"percentile_inc_nulls\": 0.8222069144248962, \"value_count\": 16, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 176.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8190115690231323, \"percentile_inc_nulls\": 0.8190115690231323, \"value_count\": 15, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 195.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8148821592330933, \"percentile_inc_nulls\": 0.8148821592330933, \"value_count\": 14, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 252.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8112607598304749, \"percentile_inc_nulls\": 0.8112607598304749, \"value_count\": 13, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 221.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8055583238601685, \"percentile_inc_nulls\": 0.8055583238601685, \"value_count\": 12, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 348.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8001507520675659, \"percentile_inc_nulls\": 0.8001507520675659, \"value_count\": 11, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 330.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.7937600612640381, \"percentile_inc_nulls\": 0.7937600612640381, \"value_count\": 10, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, 
\"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 390.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.7862386703491211, \"percentile_inc_nulls\": 0.7862386703491211, \"value_count\": 9, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 459.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.7773244380950928, \"percentile_inc_nulls\": 0.7773244380950928, \"value_count\": 8, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 544.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.767918586730957, \"percentile_inc_nulls\": 0.767918586730957, \"value_count\": 7, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 574.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.754645586013794, \"percentile_inc_nulls\": 0.754645586013794, \"value_count\": 6, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 810.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.7358830571174622, \"percentile_inc_nulls\": 0.7358830571174622, \"value_count\": 5, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1145.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.7013404369354248, \"percentile_inc_nulls\": 0.7013404369354248, \"value_count\": 4, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 2108.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.6384655833244324, \"percentile_inc_nulls\": 0.6384655833244324, \"value_count\": 3, \"group_name\": 
\"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 3837.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.4765673875808716, \"percentile_inc_nulls\": 0.4765673875808716, \"value_count\": 2, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 9880.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 1, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 29083.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 798, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 798.0, \"distinct_value_count\": 36703}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"street_address\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 36703 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 798, \"group_name\": \"_street_address_\", \"value\": \"4500 park granada\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, 
{\"value_count\": 552, \"group_name\": \"_street_address_\", \"value\": \"711 high st\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 351, \"group_name\": \"_street_address_\", \"value\": \"11 madison ave\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 347, \"group_name\": \"_street_address_\", \"value\": \"383 madison ave\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 303, \"group_name\": \"_street_address_\", \"value\": \"8400 normandale lk blvd\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 277, \"group_name\": \"_street_address_\", \"value\": \"1585 broadway\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 266, \"group_name\": \"_street_address_\", \"value\": \"85 broad st\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 242, \"group_name\": \"_street_address_\", \"value\": \"7485 new horizon way\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 231, \"group_name\": \"_street_address_\", \"value\": \"co wilmington trust company\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 224, \"group_name\": \"_street_address_\", \"value\": \"4ld financial ctr floor 10\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", 
\"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"4450 belden vlg st nw\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"two jericho plz\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"970 lk carillon dr\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"250 vly blvd\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"one north federal hwy\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 798]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}], \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\"}, {\"mode\": \"vega-lite\"});\n",
        "</script>"
       ],
       "text/plain": [
        "alt.VConcatChart(...)"
       ]
      },
-     "execution_count": 131,
+     "execution_count": 115,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "profile_columns(sec_match_df[match_cols], db_api=DuckDBAPI(), top_n=10, bottom_n=5)"
+    "profile_columns(sec_match_df[MATCH_COLS], db_api=DuckDBAPI(), top_n=10, bottom_n=5)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 132,
+   "execution_count": 116,
    "id": "d9cbc91b-b061-461b-b1f2-83036c70d7c7",
    "metadata": {},
    "outputs": [
@@ -1328,23 +954,23 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-397c6262cdc84c339cc4c3a1f6a11a12.vega-embed {\n",
+       "  #altair-viz-bcd8a0235c5a4a24b1b0057a8f800b3f.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-397c6262cdc84c339cc4c3a1f6a11a12.vega-embed details,\n",
-       "  #altair-viz-397c6262cdc84c339cc4c3a1f6a11a12.vega-embed details summary {\n",
+       "  #altair-viz-bcd8a0235c5a4a24b1b0057a8f800b3f.vega-embed details,\n",
+       "  #altair-viz-bcd8a0235c5a4a24b1b0057a8f800b3f.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-397c6262cdc84c339cc4c3a1f6a11a12\"></div>\n",
+       "<div id=\"altair-viz-bcd8a0235c5a4a24b1b0057a8f800b3f\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-397c6262cdc84c339cc4c3a1f6a11a12\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-397c6262cdc84c339cc4c3a1f6a11a12\");\n",
+       "    if (outputDiv.id !== \"altair-viz-bcd8a0235c5a4a24b1b0057a8f800b3f\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-bcd8a0235c5a4a24b1b0057a8f800b3f\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -1390,20 +1016,20 @@
        "        .catch(showError)\n",
        "        .then(() => displayChart(vegaEmbed));\n",
        "    }\n",
-       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"vconcat\": [{\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9993402361869812, \"percentile_inc_nulls\": 0.9993402361869812, \"value_count\": 26, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 26.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 0.9982744455337524, \"percentile_inc_nulls\": 0.9982744455337524, \"value_count\": 21, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 0.99776691198349, \"percentile_inc_nulls\": 0.99776691198349, \"value_count\": 20, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 20.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 0.9973101019859314, \"percentile_inc_nulls\": 0.9973101019859314, \"value_count\": 18, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 18.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 0.9968787431716919, \"percentile_inc_nulls\": 0.9968787431716919, \"value_count\": 17, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 17.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 0.9952292442321777, \"percentile_inc_nulls\": 0.9952292442321777, \"value_count\": 13, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 65.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 
0.9943157434463501, \"percentile_inc_nulls\": 0.9943157434463501, \"value_count\": 12, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 36.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 0.9923617839813232, \"percentile_inc_nulls\": 0.9923617839813232, \"value_count\": 11, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 77.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 0.9900779128074646, \"percentile_inc_nulls\": 0.9900779128074646, \"value_count\": 10, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 0.9873372912406921, \"percentile_inc_nulls\": 0.9873372912406921, \"value_count\": 9, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 0.9769837856292725, \"percentile_inc_nulls\": 0.9769837856292725, \"value_count\": 8, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 408.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 0.9526479840278625, \"percentile_inc_nulls\": 0.9526479840278625, \"value_count\": 7, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 959.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 0.9031643867492676, \"percentile_inc_nulls\": 0.9031643867492676, \"value_count\": 6, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 1950.0, 
\"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 0.825640082359314, \"percentile_inc_nulls\": 0.825640082359314, \"value_count\": 5, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 3055.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 0.6853604316711426, \"percentile_inc_nulls\": 0.6853604316711426, \"value_count\": 4, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 5528.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 0.48103129863739014, \"percentile_inc_nulls\": 0.48103129863739014, \"value_count\": 3, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 8052.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 0.20037049055099487, \"percentile_inc_nulls\": 0.20037049055099487, \"value_count\": 2, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 11060.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 1, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 7896.0, \"distinct_value_count\": 18658}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 26, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 26.0, \"distinct_value_count\": 18658}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": 
\"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"company_name\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 18658 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 26, \"group_name\": \"_company_name_\", \"value\": \"georgia pacific corporation\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 18658}, {\"value_count\": 21, \"group_name\": \"_company_name_\", \"value\": \"international paper company\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 18658}, {\"value_count\": 21, \"group_name\": \"_company_name_\", \"value\": \"calpine operating services company incorporated\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 18658}, {\"value_count\": 20, \"group_name\": \"_company_name_\", \"value\": \"calpine corporation\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 18658}, {\"value_count\": 18, \"group_name\": \"_company_name_\", \"value\": \"calpine eastern corporation\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 18658}, {\"value_count\": 17, \"group_name\": \"_company_name_\", \"value\": \"weyerhaeuser company\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 18658}, {\"value_count\": 13, \"group_name\": \"_company_name_\", \"value\": \"dow chemical company\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 18658}, {\"value_count\": 13, \"group_name\": 
\"_company_name_\", \"value\": \"newpage corporation\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 18658}, {\"value_count\": 13, \"group_name\": \"_company_name_\", \"value\": \"smurfit stone container corporation\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 18658}, {\"value_count\": 13, \"group_name\": \"_company_name_\", \"value\": \"springfield city of\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 18658}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"valley ng power company limited liability company\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 18658}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"crossover wind limited liability company\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 18658}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"lowell limited liability company\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 18658}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"mm tomoka farms energy limited liability company\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 18658}, {\"value_count\": 1, \"group_name\": 
\"_company_name_\", \"value\": \"caron garden limited liability company\", \"total_non_null_rows\": 39407, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 18658}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 26]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.8926863074302673, \"percentile_inc_nulls\": 0.9180095195770264, \"value_count\": 3231, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 3231.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.8061645030975342, \"percentile_inc_nulls\": 0.8519045114517212, \"value_count\": 2605, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 2605.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.7459479570388794, \"percentile_inc_nulls\": 0.8058974146842957, \"value_count\": 1813, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 1813.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.692307710647583, \"percentile_inc_nulls\": 0.764914870262146, \"value_count\": 1615, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 1615.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 
0.6463066339492798, \"percentile_inc_nulls\": 0.7297688126564026, \"value_count\": 1385, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 1385.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.6083765029907227, \"percentile_inc_nulls\": 0.700789213180542, \"value_count\": 1142, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 1142.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.5708781480789185, \"percentile_inc_nulls\": 0.6721394658088684, \"value_count\": 1129, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 1129.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.5375980138778687, \"percentile_inc_nulls\": 0.6467125415802002, \"value_count\": 1002, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 1002.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.5072405934333801, \"percentile_inc_nulls\": 0.623518705368042, \"value_count\": 914, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 914.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.4776139259338379, \"percentile_inc_nulls\": 0.6008831262588501, \"value_count\": 892, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 892.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.4506111145019531, \"percentile_inc_nulls\": 0.5802522301673889, \"value_count\": 813, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 813.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 
0.4274279475212097, \"percentile_inc_nulls\": 0.5625396370887756, \"value_count\": 698, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 698.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.4053075313568115, \"percentile_inc_nulls\": 0.5456390976905823, \"value_count\": 666, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 666.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.38355255126953125, \"percentile_inc_nulls\": 0.5290176868438721, \"value_count\": 655, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 655.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.36219608783721924, \"percentile_inc_nulls\": 0.5127007961273193, \"value_count\": 643, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 643.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.34253352880477905, \"percentile_inc_nulls\": 0.49767810106277466, \"value_count\": 592, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 592.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.3235684633255005, \"percentile_inc_nulls\": 0.483188271522522, \"value_count\": 571, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 571.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.30476951599121094, \"percentile_inc_nulls\": 0.4688253402709961, \"value_count\": 566, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 566.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 
0.2862362265586853, \"percentile_inc_nulls\": 0.4546654224395752, \"value_count\": 558, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 558.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.26853328943252563, \"percentile_inc_nulls\": 0.4411398768424988, \"value_count\": 533, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 533.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.251461386680603, \"percentile_inc_nulls\": 0.4280965328216553, \"value_count\": 514, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 514.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.2351534366607666, \"percentile_inc_nulls\": 0.4156368374824524, \"value_count\": 491, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 491.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.21914440393447876, \"percentile_inc_nulls\": 0.40340548753738403, \"value_count\": 482, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 482.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.2044306993484497, \"percentile_inc_nulls\": 0.39216381311416626, \"value_count\": 443, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 443.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.19051414728164673, \"percentile_inc_nulls\": 0.3815311789512634, \"value_count\": 419, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 419.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 
0.17686331272125244, \"percentile_inc_nulls\": 0.3711015582084656, \"value_count\": 411, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 411.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.16444134712219238, \"percentile_inc_nulls\": 0.36161088943481445, \"value_count\": 374, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 374.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.1528165340423584, \"percentile_inc_nulls\": 0.3527292013168335, \"value_count\": 350, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 350.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.1417563557624817, \"percentile_inc_nulls\": 0.3442789316177368, \"value_count\": 333, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 333.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.1311279535293579, \"percentile_inc_nulls\": 0.3361585736274719, \"value_count\": 320, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 320.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.12076526880264282, \"percentile_inc_nulls\": 0.32824116945266724, \"value_count\": 312, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 312.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.110801100730896, \"percentile_inc_nulls\": 0.32062828540802, \"value_count\": 300, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 300.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 
0.10096985101699829, \"percentile_inc_nulls\": 0.31311696767807007, \"value_count\": 296, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 296.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.09253352880477905, \"percentile_inc_nulls\": 0.3066713809967041, \"value_count\": 254, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 254.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.08442938327789307, \"percentile_inc_nulls\": 0.30047959089279175, \"value_count\": 244, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 244.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.07685667276382446, \"percentile_inc_nulls\": 0.29469382762908936, \"value_count\": 228, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 228.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.06257474422454834, \"percentile_inc_nulls\": 0.28378206491470337, \"value_count\": 215, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 430.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.05616450309753418, \"percentile_inc_nulls\": 0.2788844704627991, \"value_count\": 193, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 193.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.050019919872283936, \"percentile_inc_nulls\": 0.27418988943099976, \"value_count\": 185, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 185.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 
0.04483860731124878, \"percentile_inc_nulls\": 0.2702311873435974, \"value_count\": 156, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 156.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.0397568941116333, \"percentile_inc_nulls\": 0.26634860038757324, \"value_count\": 153, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 153.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.03484123945236206, \"percentile_inc_nulls\": 0.2625929117202759, \"value_count\": 148, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 148.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.030324161052703857, \"percentile_inc_nulls\": 0.25914180278778076, \"value_count\": 136, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 136.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.025873541831970215, \"percentile_inc_nulls\": 0.2557413578033447, \"value_count\": 134, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 134.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.02172178030014038, \"percentile_inc_nulls\": 0.252569317817688, \"value_count\": 125, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 125.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.017868995666503906, \"percentile_inc_nulls\": 0.24962568283081055, \"value_count\": 116, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 116.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 
0.014149069786071777, \"percentile_inc_nulls\": 0.2467835545539856, \"value_count\": 112, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.010728061199188232, \"percentile_inc_nulls\": 0.24416983127593994, \"value_count\": 103, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 103.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.008037745952606201, \"percentile_inc_nulls\": 0.2421143651008606, \"value_count\": 81, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 81.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.0053806304931640625, \"percentile_inc_nulls\": 0.2400842308998108, \"value_count\": 80, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 80.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.0031552910804748535, \"percentile_inc_nulls\": 0.23838406801223755, \"value_count\": 67, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 67.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.001926422119140625, \"percentile_inc_nulls\": 0.23744511604309082, \"value_count\": 37, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 37.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.0011292695999145508, \"percentile_inc_nulls\": 0.23683607578277588, \"value_count\": 24, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 24.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 
0.000597834587097168, \"percentile_inc_nulls\": 0.23643004894256592, \"value_count\": 16, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.0003985762596130371, \"percentile_inc_nulls\": 0.23627781867980957, \"value_count\": 6, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 6.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.000298917293548584, \"percentile_inc_nulls\": 0.2362017035484314, \"value_count\": 3, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 3.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 9.965896606445312e-05, \"percentile_inc_nulls\": 0.23604941368103027, \"value_count\": 2, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 6.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.2359732985496521, \"value_count\": 1, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 3.0, \"distinct_value_count\": 63}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 3231, \"group_name\": \"_state_\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 3231.0, \"distinct_value_count\": 63}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": 
\"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"state\\\"\", \"subtitle\": \"In this col, 9,299 values (23.6%) are null and there are 63 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 3231, \"group_name\": \"_state_\", \"value\": \"ca\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 63}, {\"value_count\": 2605, \"group_name\": \"_state_\", \"value\": \"tx\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 63}, {\"value_count\": 1813, \"group_name\": \"_state_\", \"value\": \"ny\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 63}, {\"value_count\": 1615, \"group_name\": \"_state_\", \"value\": \"nc\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 63}, {\"value_count\": 1385, \"group_name\": \"_state_\", \"value\": \"ma\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 63}, {\"value_count\": 1142, \"group_name\": \"_state_\", \"value\": \"mn\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 63}, {\"value_count\": 1129, \"group_name\": \"_state_\", \"value\": \"fl\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 63}, {\"value_count\": 1002, \"group_name\": \"_state_\", \"value\": \"pa\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 63}, {\"value_count\": 914, \"group_name\": \"_state_\", \"value\": \"nj\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 63}, {\"value_count\": 892, \"group_name\": \"_state_\", 
\"value\": \"il\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 63}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"uk\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 63}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"8a\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 63}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"mp\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 63}, {\"value_count\": 2, \"group_name\": \"_state_\", \"value\": \"as\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 63}, {\"value_count\": 2, \"group_name\": \"_state_\", \"value\": \"gu\", \"total_non_null_rows\": 30108, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 63}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 3231]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 
values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9577498435974121, \"percentile_inc_nulls\": 0.9728982448577881, \"value_count\": 1068, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 1068.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.9291083216667175, \"percentile_inc_nulls\": 0.9545258283615112, \"value_count\": 724, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 724.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.9056096076965332, \"percentile_inc_nulls\": 0.9394524097442627, \"value_count\": 594, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 594.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.8830999135971069, \"percentile_inc_nulls\": 0.9250133037567139, \"value_count\": 569, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 569.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.862963855266571, \"percentile_inc_nulls\": 0.9120968580245972, \"value_count\": 509, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 509.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.8452804684638977, \"percentile_inc_nulls\": 0.9007536768913269, \"value_count\": 447, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 447.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.8341245651245117, \"percentile_inc_nulls\": 0.8935976028442383, \"value_count\": 282, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 282.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.8247883319854736, \"percentile_inc_nulls\": 0.8876088261604309, \"value_count\": 236, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 236.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.8160851001739502, \"percentile_inc_nulls\": 0.8820260167121887, \"value_count\": 220, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 220.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.8079752922058105, \"percentile_inc_nulls\": 0.876823902130127, \"value_count\": 205, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 205.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.8000237345695496, \"percentile_inc_nulls\": 0.8717232942581177, \"value_count\": 201, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 201.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.7921908497810364, \"percentile_inc_nulls\": 0.8666988015174866, \"value_count\": 198, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 198.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.7845557332038879, \"percentile_inc_nulls\": 0.8618012070655823, \"value_count\": 193, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 193.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.7701558470726013, \"percentile_inc_nulls\": 0.85256427526474, \"value_count\": 182, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 364.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.763351559638977, \"percentile_inc_nulls\": 0.8481995463371277, \"value_count\": 172, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 172.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.7567845582962036, \"percentile_inc_nulls\": 0.8439871072769165, \"value_count\": 166, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 166.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.7502967119216919, \"percentile_inc_nulls\": 0.8398253917694092, \"value_count\": 164, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 164.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.7442044019699097, \"percentile_inc_nulls\": 0.8359174728393555, \"value_count\": 154, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 154.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.7384287118911743, \"percentile_inc_nulls\": 0.8322125673294067, \"value_count\": 146, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 146.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.7328506708145142, \"percentile_inc_nulls\": 0.82863450050354, \"value_count\": 141, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 141.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.7275892496109009, \"percentile_inc_nulls\": 0.8252594470977783, \"value_count\": 133, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 133.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.7224463820457458, \"percentile_inc_nulls\": 0.8219605684280396, \"value_count\": 130, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 130.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.7176991701126099, \"percentile_inc_nulls\": 0.8189154267311096, \"value_count\": 120, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.7130706310272217, \"percentile_inc_nulls\": 0.8159464001655579, \"value_count\": 117, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 117.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.703892707824707, \"percentile_inc_nulls\": 0.810059130191803, \"value_count\": 116, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 232.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6993433237075806, \"percentile_inc_nulls\": 0.8071408271789551, \"value_count\": 115, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 115.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6948334574699402, \"percentile_inc_nulls\": 0.8042479753494263, \"value_count\": 114, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6904423236846924, \"percentile_inc_nulls\": 0.8014312386512756, \"value_count\": 111, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 111.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6862488985061646, \"percentile_inc_nulls\": 0.798741340637207, \"value_count\": 106, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 106.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6779413223266602, \"percentile_inc_nulls\": 0.7934123277664185, \"value_count\": 105, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 210.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6738666296005249, \"percentile_inc_nulls\": 0.7907986044883728, \"value_count\": 103, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 103.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6698315143585205, \"percentile_inc_nulls\": 0.7882102131843567, \"value_count\": 102, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6658754348754883, \"percentile_inc_nulls\": 0.7856726050376892, \"value_count\": 100, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 100.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6581217050552368, \"percentile_inc_nulls\": 0.7806988954544067, \"value_count\": 98, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 196.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.654323935508728, \"percentile_inc_nulls\": 0.778262734413147, \"value_count\": 96, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 96.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6508030891418457, \"percentile_inc_nulls\": 0.7760042548179626, \"value_count\": 89, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 89.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6473218202590942, \"percentile_inc_nulls\": 0.7737711668014526, \"value_count\": 88, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6440383195877075, \"percentile_inc_nulls\": 0.7716649174690247, \"value_count\": 83, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 83.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6409130096435547, \"percentile_inc_nulls\": 0.769660234451294, \"value_count\": 79, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 79.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6378273963928223, \"percentile_inc_nulls\": 0.7676808834075928, \"value_count\": 78, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.634820818901062, \"percentile_inc_nulls\": 0.7657522559165955, \"value_count\": 76, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 76.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6318933367729187, \"percentile_inc_nulls\": 0.7638744115829468, \"value_count\": 74, \"group_name\": \"_city_\", \"total_non_null_rows\": 
25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 74.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6261175870895386, \"percentile_inc_nulls\": 0.760169506072998, \"value_count\": 73, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 146.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6205000281333923, \"percentile_inc_nulls\": 0.7565661072731018, \"value_count\": 71, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 142.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6178890466690063, \"percentile_inc_nulls\": 0.7548912763595581, \"value_count\": 66, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 66.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6128253936767578, \"percentile_inc_nulls\": 0.7516431212425232, \"value_count\": 64, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 128.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6103330850601196, \"percentile_inc_nulls\": 0.7500444054603577, \"value_count\": 63, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 63.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.607959508895874, \"percentile_inc_nulls\": 0.7485218048095703, \"value_count\": 60, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 60.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.605823278427124, \"percentile_inc_nulls\": 0.7471514940261841, \"value_count\": 54, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, 
\"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 54.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.6037265658378601, \"percentile_inc_nulls\": 0.7458065748214722, \"value_count\": 53, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 53.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5997705459594727, \"percentile_inc_nulls\": 0.7432689666748047, \"value_count\": 50, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 100.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5978716611862183, \"percentile_inc_nulls\": 0.7420508861541748, \"value_count\": 48, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 48.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5922936797142029, \"percentile_inc_nulls\": 0.7384728193283081, \"value_count\": 47, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 141.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5904738903045654, \"percentile_inc_nulls\": 0.7373055219650269, \"value_count\": 46, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 46.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5835113525390625, \"percentile_inc_nulls\": 0.7328393459320068, \"value_count\": 44, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 176.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5784081220626831, \"percentile_inc_nulls\": 0.7295657992362976, \"value_count\": 43, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, 
\"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 129.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5750850439071655, \"percentile_inc_nulls\": 0.7274342179298401, \"value_count\": 42, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 84.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5734630823135376, \"percentile_inc_nulls\": 0.7263938188552856, \"value_count\": 41, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 41.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5655510425567627, \"percentile_inc_nulls\": 0.7213185429573059, \"value_count\": 40, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5609225034713745, \"percentile_inc_nulls\": 0.7183495759963989, \"value_count\": 39, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 117.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5594192743301392, \"percentile_inc_nulls\": 0.7173852324485779, \"value_count\": 38, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 38.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.552100658416748, \"percentile_inc_nulls\": 0.7126906514167786, \"value_count\": 37, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 185.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5464040040969849, \"percentile_inc_nulls\": 0.7090364694595337, \"value_count\": 36, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, 
\"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5436347723007202, \"percentile_inc_nulls\": 0.7072601318359375, \"value_count\": 35, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5395996570587158, \"percentile_inc_nulls\": 0.7046717405319214, \"value_count\": 34, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5356832146644592, \"percentile_inc_nulls\": 0.7021595239639282, \"value_count\": 33, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5293536186218262, \"percentile_inc_nulls\": 0.6980993151664734, \"value_count\": 32, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5232217311859131, \"percentile_inc_nulls\": 0.6941660046577454, \"value_count\": 31, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 155.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5196613669395447, \"percentile_inc_nulls\": 0.6918821334838867, \"value_count\": 30, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5139251947402954, \"percentile_inc_nulls\": 0.6882026195526123, \"value_count\": 29, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, 
\"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 145.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5072790384292603, \"percentile_inc_nulls\": 0.6839393973350525, \"value_count\": 28, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 168.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5051428079605103, \"percentile_inc_nulls\": 0.6825690865516663, \"value_count\": 27, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 54.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.5030856728553772, \"percentile_inc_nulls\": 0.6812494993209839, \"value_count\": 26, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 52.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.49814069271087646, \"percentile_inc_nulls\": 0.6780774593353271, \"value_count\": 25, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 125.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.4943429231643677, \"percentile_inc_nulls\": 0.6756414175033569, \"value_count\": 24, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 96.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.4888836145401001, \"percentile_inc_nulls\": 0.6721394658088684, \"value_count\": 23, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.4819210171699524, \"percentile_inc_nulls\": 0.6676732301712036, \"value_count\": 22, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, 
\"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 176.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.4719519019126892, \"percentile_inc_nulls\": 0.661278486251831, \"value_count\": 21, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 252.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.46562230587005615, \"percentile_inc_nulls\": 0.6572182178497314, \"value_count\": 20, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.4626157283782959, \"percentile_inc_nulls\": 0.6552896499633789, \"value_count\": 19, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 76.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.45122241973876953, \"percentile_inc_nulls\": 0.6479812860488892, \"value_count\": 18, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 288.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.4418070912361145, \"percentile_inc_nulls\": 0.6419417858123779, \"value_count\": 17, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 238.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.43357861042022705, \"percentile_inc_nulls\": 0.636663556098938, \"value_count\": 16, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 208.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.4223039746284485, \"percentile_inc_nulls\": 0.6294313073158264, \"value_count\": 15, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, 
\"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 285.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.4117810130119324, \"percentile_inc_nulls\": 0.6226812601089478, \"value_count\": 14, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 266.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.402523934841156, \"percentile_inc_nulls\": 0.6167432069778442, \"value_count\": 13, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.3925548195838928, \"percentile_inc_nulls\": 0.6103484034538269, \"value_count\": 12, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 252.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.37775933742523193, \"percentile_inc_nulls\": 0.6008577346801758, \"value_count\": 11, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 374.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.36153966188430786, \"percentile_inc_nulls\": 0.5904535055160522, \"value_count\": 10, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 410.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.3426694869995117, \"percentile_inc_nulls\": 0.5783489942550659, \"value_count\": 9, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 477.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.3220982551574707, \"percentile_inc_nulls\": 0.5651533603668213, \"value_count\": 8, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, 
\"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 520.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.3018830418586731, \"percentile_inc_nulls\": 0.552186131477356, \"value_count\": 7, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 511.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.27696019411087036, \"percentile_inc_nulls\": 0.536199152469635, \"value_count\": 6, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 630.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.234433114528656, \"percentile_inc_nulls\": 0.5089197158813477, \"value_count\": 5, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 1075.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.18537861108779907, \"percentile_inc_nulls\": 0.47745323181152344, \"value_count\": 4, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 1240.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.13161641359329224, \"percentile_inc_nulls\": 0.4429669976234436, \"value_count\": 3, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 1359.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.07259279489517212, \"percentile_inc_nulls\": 0.40510571002960205, \"value_count\": 2, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 1492.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.3585403561592102, \"value_count\": 1, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, 
\"sum_tokens_in_value_count_group\": 1835.0, \"distinct_value_count\": 4225}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 1068, \"group_name\": \"_city_\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 1068.0, \"distinct_value_count\": 4225}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"city\\\"\", \"subtitle\": \"In this col, 14,129 values (35.9%) are null and there are 4225 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1068, \"group_name\": \"_city_\", \"value\": \"houston\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 4225}, {\"value_count\": 724, \"group_name\": \"_city_\", \"value\": \"new york\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 4225}, {\"value_count\": 594, \"group_name\": \"_city_\", \"value\": \"juno beach\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 4225}, {\"value_count\": 569, \"group_name\": \"_city_\", \"value\": \"san francisco\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 4225}, {\"value_count\": 509, \"group_name\": \"_city_\", \"value\": \"boston\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, 
\"distinct_value_count\": 4225}, {\"value_count\": 447, \"group_name\": \"_city_\", \"value\": \"charlotte\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 4225}, {\"value_count\": 282, \"group_name\": \"_city_\", \"value\": \"chicago\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 4225}, {\"value_count\": 236, \"group_name\": \"_city_\", \"value\": \"san diego\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 4225}, {\"value_count\": 220, \"group_name\": \"_city_\", \"value\": \"andover\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 4225}, {\"value_count\": 205, \"group_name\": \"_city_\", \"value\": \"omaha\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 4225}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"stafford springs\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 4225}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"frisco\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 4225}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"forth worth\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 4225}, {\"value_count\": 1, 
\"group_name\": \"_city_\", \"value\": \"tooele\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 4225}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"munster\", \"total_non_null_rows\": 25278, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 4225}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 1068]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9842325448989868, \"percentile_inc_nulls\": 0.9920572638511658, \"value_count\": 313, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 313.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9770288467407227, \"percentile_inc_nulls\": 0.9884284734725952, \"value_count\": 143, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 143.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9700770974159241, \"percentile_inc_nulls\": 0.9849265217781067, \"value_count\": 138, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9637297987937927, \"percentile_inc_nulls\": 0.9817291498184204, \"value_count\": 126, \"group_name\": 
\"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 126.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9592967629432678, \"percentile_inc_nulls\": 0.9794960021972656, \"value_count\": 88, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9553171396255493, \"percentile_inc_nulls\": 0.9774913191795349, \"value_count\": 79, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 79.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9518412351608276, \"percentile_inc_nulls\": 0.975740373134613, \"value_count\": 69, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 69.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9484156966209412, \"percentile_inc_nulls\": 0.9740147590637207, \"value_count\": 68, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 68.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9450909495353699, \"percentile_inc_nulls\": 0.972339928150177, \"value_count\": 66, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 66.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9419676661491394, \"percentile_inc_nulls\": 0.9707666039466858, \"value_count\": 62, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 62.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9389955401420593, \"percentile_inc_nulls\": 
0.9692693948745728, \"value_count\": 59, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 59.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9361745119094849, \"percentile_inc_nulls\": 0.9678483605384827, \"value_count\": 56, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9334542155265808, \"percentile_inc_nulls\": 0.9664780497550964, \"value_count\": 54, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 54.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9307843446731567, \"percentile_inc_nulls\": 0.9651330709457397, \"value_count\": 53, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 53.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9256460666656494, \"percentile_inc_nulls\": 0.9625447392463684, \"value_count\": 51, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9232280254364014, \"percentile_inc_nulls\": 0.9613266587257385, \"value_count\": 48, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 48.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9184927940368652, \"percentile_inc_nulls\": 0.9589412808418274, \"value_count\": 47, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 94.0, \"distinct_value_count\": 10892}, 
{\"percentile_ex_nulls\": 0.9161754846572876, \"percentile_inc_nulls\": 0.9577739834785461, \"value_count\": 46, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 46.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9139086008071899, \"percentile_inc_nulls\": 0.9566320776939392, \"value_count\": 45, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 45.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.909475564956665, \"percentile_inc_nulls\": 0.9543989896774292, \"value_count\": 44, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.9055463075637817, \"percentile_inc_nulls\": 0.952419638633728, \"value_count\": 39, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8998035192489624, \"percentile_inc_nulls\": 0.9495267271995544, \"value_count\": 38, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8960757851600647, \"percentile_inc_nulls\": 0.9476488828659058, \"value_count\": 37, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 74.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8925495147705078, \"percentile_inc_nulls\": 0.9458725452423096, \"value_count\": 35, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, 
\"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8891239762306213, \"percentile_inc_nulls\": 0.944146990776062, \"value_count\": 34, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 68.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8874616026878357, \"percentile_inc_nulls\": 0.9433095455169678, \"value_count\": 33, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8842375874519348, \"percentile_inc_nulls\": 0.9416854977607727, \"value_count\": 32, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 64.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8811143040657043, \"percentile_inc_nulls\": 0.9401121735572815, \"value_count\": 31, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 62.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8796030282974243, \"percentile_inc_nulls\": 0.9393509030342102, \"value_count\": 30, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 30.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.875371515750885, \"percentile_inc_nulls\": 0.9372192621231079, \"value_count\": 28, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 84.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8644904494285583, \"percentile_inc_nulls\": 0.931738018989563, \"value_count\": 27, \"group_name\": \"_street_address_\", 
\"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 216.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.860561192035675, \"percentile_inc_nulls\": 0.9297586679458618, \"value_count\": 26, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8530048727989197, \"percentile_inc_nulls\": 0.9259522557258606, \"value_count\": 25, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 150.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8469598293304443, \"percentile_inc_nulls\": 0.9229071140289307, \"value_count\": 24, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8434839248657227, \"percentile_inc_nulls\": 0.9211561679840088, \"value_count\": 23, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 69.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8390509486198425, \"percentile_inc_nulls\": 0.918923020362854, \"value_count\": 22, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8337615132331848, \"percentile_inc_nulls\": 0.9162585139274597, \"value_count\": 21, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 105.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8297314643859863, \"percentile_inc_nulls\": 0.9142284393310547, 
\"value_count\": 20, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 80.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8259029984474182, \"percentile_inc_nulls\": 0.9122998714447021, \"value_count\": 19, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 76.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8213691711425781, \"percentile_inc_nulls\": 0.9100160002708435, \"value_count\": 18, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8119490146636963, \"percentile_inc_nulls\": 0.9052706360816956, \"value_count\": 17, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 187.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.8046950101852417, \"percentile_inc_nulls\": 0.9016164541244507, \"value_count\": 16, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.799405574798584, \"percentile_inc_nulls\": 0.8989519476890564, \"value_count\": 15, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 105.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.7923530340194702, \"percentile_inc_nulls\": 0.895399272441864, \"value_count\": 14, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 140.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 
0.7831847071647644, \"percentile_inc_nulls\": 0.8907808065414429, \"value_count\": 13, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 182.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.7716991901397705, \"percentile_inc_nulls\": 0.8849950432777405, \"value_count\": 12, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 228.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.7633872032165527, \"percentile_inc_nulls\": 0.8808079957962036, \"value_count\": 11, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 165.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.7528084516525269, \"percentile_inc_nulls\": 0.875478982925415, \"value_count\": 10, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 210.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.7396605014801025, \"percentile_inc_nulls\": 0.8688557744026184, \"value_count\": 9, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 261.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.7235403656959534, \"percentile_inc_nulls\": 0.8607354164123535, \"value_count\": 8, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 320.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.7062616348266602, \"percentile_inc_nulls\": 0.8520313501358032, \"value_count\": 7, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 343.0, 
\"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.6808724999427795, \"percentile_inc_nulls\": 0.8392417430877686, \"value_count\": 6, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 504.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.6536698341369629, \"percentile_inc_nulls\": 0.8255386352539062, \"value_count\": 5, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 540.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.6141756176948547, \"percentile_inc_nulls\": 0.8056436777114868, \"value_count\": 4, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 784.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.559921383857727, \"percentile_inc_nulls\": 0.7783135175704956, \"value_count\": 3, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 1077.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.4305576682090759, \"percentile_inc_nulls\": 0.7131474018096924, \"value_count\": 2, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 2568.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.4962570071220398, \"value_count\": 1, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 8547.0, \"distinct_value_count\": 10892}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 313, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, 
\"sum_tokens_in_value_count_group\": 313.0, \"distinct_value_count\": 10892}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"street_address\\\"\", \"subtitle\": \"In this col, 19,556 values (49.6%) are null and there are 10892 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 313, \"group_name\": \"_street_address_\", \"value\": \"700 universe blvd\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 10892}, {\"value_count\": 143, \"group_name\": \"_street_address_\", \"value\": \"800 taylor st, suite 200\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 10892}, {\"value_count\": 138, \"group_name\": \"_street_address_\", \"value\": \"700 universe blvd.\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 10892}, {\"value_count\": 126, \"group_name\": \"_street_address_\", \"value\": \"130 roberts street\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 10892}, {\"value_count\": 88, \"group_name\": \"_street_address_\", \"value\": \"333 washington street\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 10892}, {\"value_count\": 79, \"group_name\": \"_street_address_\", \"value\": \"1519 king street\", \"total_non_null_rows\": 
19851, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 10892}, {\"value_count\": 69, \"group_name\": \"_street_address_\", \"value\": \"575 fifth ave., 35th fl.\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 10892}, {\"value_count\": 68, \"group_name\": \"_street_address_\", \"value\": \"101 summer street, 2nd floor\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 10892}, {\"value_count\": 66, \"group_name\": \"_street_address_\", \"value\": \"14302 fnb parkway\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 10892}, {\"value_count\": 62, \"group_name\": \"_street_address_\", \"value\": \"66 york street, 5th floor\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 10892}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"14700 downey avenue\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 10892}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"80 vandenburgh avenue\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 10892}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"32982 road 80\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 10892}, 
{\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"401 f street, nw\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 10892}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"9600 sw barnes rd\", \"total_non_null_rows\": 19851, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 10892}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 313]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9763555526733398, \"percentile_inc_nulls\": 0.9850280284881592, \"value_count\": 590, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 590.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.9581212401390076, \"percentile_inc_nulls\": 0.9734818935394287, \"value_count\": 455, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 455.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.9498657584190369, \"percentile_inc_nulls\": 0.9682543873786926, \"value_count\": 206, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 206.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.9417705535888672, \"percentile_inc_nulls\": 0.963128387928009, \"value_count\": 
202, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 202.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.9337554574012756, \"percentile_inc_nulls\": 0.9580531120300293, \"value_count\": 200, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.9258606433868408, \"percentile_inc_nulls\": 0.9530540108680725, \"value_count\": 197, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 197.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.9186069965362549, \"percentile_inc_nulls\": 0.9484609365463257, \"value_count\": 181, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 181.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.9114735722541809, \"percentile_inc_nulls\": 0.943943977355957, \"value_count\": 178, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 178.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.9044603705406189, \"percentile_inc_nulls\": 0.9395031332969666, \"value_count\": 175, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 175.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8975273370742798, \"percentile_inc_nulls\": 0.9351130723953247, \"value_count\": 173, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 173.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8906744718551636, \"percentile_inc_nulls\": 0.9307737350463867, 
\"value_count\": 171, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 171.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8843024969100952, \"percentile_inc_nulls\": 0.9267389178276062, \"value_count\": 159, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 159.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.878010630607605, \"percentile_inc_nulls\": 0.9227548241615295, \"value_count\": 157, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 157.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8724802732467651, \"percentile_inc_nulls\": 0.9192529320716858, \"value_count\": 138, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8674708604812622, \"percentile_inc_nulls\": 0.916080892086029, \"value_count\": 125, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 125.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8627419471740723, \"percentile_inc_nulls\": 0.9130865335464478, \"value_count\": 118, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 118.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8580932021141052, \"percentile_inc_nulls\": 0.9101428985595703, \"value_count\": 116, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 116.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8498376607894897, \"percentile_inc_nulls\": 
0.9049153923988342, \"value_count\": 103, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 206.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8458702564239502, \"percentile_inc_nulls\": 0.9024031162261963, \"value_count\": 99, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8419428467750549, \"percentile_inc_nulls\": 0.8999162912368774, \"value_count\": 98, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8380956053733826, \"percentile_inc_nulls\": 0.8974801301956177, \"value_count\": 96, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 96.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8343285322189331, \"percentile_inc_nulls\": 0.8950947523117065, \"value_count\": 94, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 94.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8306015133857727, \"percentile_inc_nulls\": 0.8927347660064697, \"value_count\": 93, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 93.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.826994776725769, \"percentile_inc_nulls\": 0.8904509544372559, \"value_count\": 90, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8234280347824097, 
\"percentile_inc_nulls\": 0.8881924748420715, \"value_count\": 89, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 89.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8201418519020081, \"percentile_inc_nulls\": 0.8861116170883179, \"value_count\": 82, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 82.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8169759511947632, \"percentile_inc_nulls\": 0.8841068744659424, \"value_count\": 79, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 79.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8108043074607849, \"percentile_inc_nulls\": 0.8801989555358887, \"value_count\": 77, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 154.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8078387379646301, \"percentile_inc_nulls\": 0.87832111120224, \"value_count\": 74, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 74.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8049933910369873, \"percentile_inc_nulls\": 0.8765193819999695, \"value_count\": 71, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 71.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.8021880984306335, \"percentile_inc_nulls\": 0.8747430443763733, \"value_count\": 70, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 
0.7971386313438416, \"percentile_inc_nulls\": 0.871545672416687, \"value_count\": 63, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 126.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7946539521217346, \"percentile_inc_nulls\": 0.8699723482131958, \"value_count\": 62, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 62.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7900052070617676, \"percentile_inc_nulls\": 0.8670287132263184, \"value_count\": 58, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 116.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7855969071388245, \"percentile_inc_nulls\": 0.8642373085021973, \"value_count\": 55, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7792249321937561, \"percentile_inc_nulls\": 0.8602024912834167, \"value_count\": 53, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 159.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7729731798171997, \"percentile_inc_nulls\": 0.8562438488006592, \"value_count\": 52, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 156.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7709293365478516, \"percentile_inc_nulls\": 0.8549495935440063, \"value_count\": 51, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 51.0, \"distinct_value_count\": 6401}, 
{\"percentile_ex_nulls\": 0.7669218182563782, \"percentile_inc_nulls\": 0.8524119853973389, \"value_count\": 50, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 100.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7629944086074829, \"percentile_inc_nulls\": 0.84992516040802, \"value_count\": 49, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7591471672058105, \"percentile_inc_nulls\": 0.8474889993667603, \"value_count\": 48, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 96.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7553800940513611, \"percentile_inc_nulls\": 0.8451036810874939, \"value_count\": 47, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 94.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7535366415977478, \"percentile_inc_nulls\": 0.8439363241195679, \"value_count\": 46, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 46.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7481265068054199, \"percentile_inc_nulls\": 0.8405105471611023, \"value_count\": 45, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 135.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7445998191833496, \"percentile_inc_nulls\": 0.8382774591445923, \"value_count\": 44, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 
6401}, {\"percentile_ex_nulls\": 0.741153359413147, \"percentile_inc_nulls\": 0.8360950946807861, \"value_count\": 43, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 86.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7394702434539795, \"percentile_inc_nulls\": 0.8350293040275574, \"value_count\": 42, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7361840009689331, \"percentile_inc_nulls\": 0.8329484462738037, \"value_count\": 41, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 82.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.731374979019165, \"percentile_inc_nulls\": 0.8299033045768738, \"value_count\": 40, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7282490730285645, \"percentile_inc_nulls\": 0.8279239535331726, \"value_count\": 39, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7236804962158203, \"percentile_inc_nulls\": 0.8250311017036438, \"value_count\": 38, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.7133010029792786, \"percentile_inc_nulls\": 0.8184586763381958, \"value_count\": 37, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 259.0, 
\"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.70753014087677, \"percentile_inc_nulls\": 0.8148044943809509, \"value_count\": 36, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6963090896606445, \"percentile_inc_nulls\": 0.8076991438865662, \"value_count\": 35, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 280.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6935839653015137, \"percentile_inc_nulls\": 0.8059735298156738, \"value_count\": 34, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 68.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6896164417266846, \"percentile_inc_nulls\": 0.8034613132476807, \"value_count\": 33, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6832044124603271, \"percentile_inc_nulls\": 0.7994011044502258, \"value_count\": 32, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6807197332382202, \"percentile_inc_nulls\": 0.7978277802467346, \"value_count\": 31, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 62.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6771129369735718, \"percentile_inc_nulls\": 0.795543909072876, \"value_count\": 30, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 
90.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6724642515182495, \"percentile_inc_nulls\": 0.7926002740859985, \"value_count\": 29, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 116.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.669097900390625, \"percentile_inc_nulls\": 0.790468692779541, \"value_count\": 28, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 84.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6647697687149048, \"percentile_inc_nulls\": 0.7877280712127686, \"value_count\": 27, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6574760675430298, \"percentile_inc_nulls\": 0.7831096053123474, \"value_count\": 26, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 182.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6504628658294678, \"percentile_inc_nulls\": 0.7786687612533569, \"value_count\": 25, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 175.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6418066024780273, \"percentile_inc_nulls\": 0.773187518119812, \"value_count\": 24, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 216.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6353544592857361, \"percentile_inc_nulls\": 0.7691019177436829, \"value_count\": 23, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, 
\"sum_tokens_in_value_count_group\": 161.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6318278312683105, \"percentile_inc_nulls\": 0.7668688297271729, \"value_count\": 22, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6242536306381226, \"percentile_inc_nulls\": 0.762072741985321, \"value_count\": 21, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 189.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6194445490837097, \"percentile_inc_nulls\": 0.7590276002883911, \"value_count\": 20, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.61411452293396, \"percentile_inc_nulls\": 0.7556525468826294, \"value_count\": 19, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 133.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6112291216850281, \"percentile_inc_nulls\": 0.7538254261016846, \"value_count\": 18, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 72.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.6030537486076355, \"percentile_inc_nulls\": 0.7486487030982971, \"value_count\": 17, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 204.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.5966416597366333, \"percentile_inc_nulls\": 0.7445884943008423, \"value_count\": 16, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 
39407, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.587624728679657, \"percentile_inc_nulls\": 0.7388788461685181, \"value_count\": 15, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 225.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.5758426189422607, \"percentile_inc_nulls\": 0.731418251991272, \"value_count\": 14, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 294.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.5669859647750854, \"percentile_inc_nulls\": 0.725810170173645, \"value_count\": 13, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 221.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.5568869113922119, \"percentile_inc_nulls\": 0.7194153070449829, \"value_count\": 12, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 252.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.5414579510688782, \"percentile_inc_nulls\": 0.7096455097198486, \"value_count\": 11, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 385.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.5262293219566345, \"percentile_inc_nulls\": 0.7000025510787964, \"value_count\": 10, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 380.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.5081954002380371, \"percentile_inc_nulls\": 0.688583254814148, \"value_count\": 9, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, 
\"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 450.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.48639440536499023, \"percentile_inc_nulls\": 0.6747785806655884, \"value_count\": 8, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 544.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.4659159183502197, \"percentile_inc_nulls\": 0.661811351776123, \"value_count\": 7, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 511.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.4298481345176697, \"percentile_inc_nulls\": 0.6389727592468262, \"value_count\": 6, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 900.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.3877689838409424, \"percentile_inc_nulls\": 0.6123277544975281, \"value_count\": 5, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 1050.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.3178775906562805, \"percentile_inc_nulls\": 0.5680716633796692, \"value_count\": 4, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 1744.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.2271069884300232, \"percentile_inc_nulls\": 0.5105946063995361, \"value_count\": 3, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 2265.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.11842262744903564, \"percentile_inc_nulls\": 0.44177430868148804, \"value_count\": 2, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 
24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 2712.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.3667876124382019, \"value_count\": 1, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 2955.0, \"distinct_value_count\": 6401}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 590, \"group_name\": \"_zip_code_\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"sum_tokens_in_value_count_group\": 590.0, \"distinct_value_count\": 6401}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"zip_code\\\"\", \"subtitle\": \"In this col, 14,454 values (36.7%) are null and there are 6401 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 590, \"group_name\": \"_zip_code_\", \"value\": \"33408\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 6401}, {\"value_count\": 455, \"group_name\": \"_zip_code_\", \"value\": \"77002\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 6401}, {\"value_count\": 206, \"group_name\": \"_zip_code_\", \"value\": \"01810\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 6401}, 
{\"value_count\": 202, \"group_name\": \"_zip_code_\", \"value\": \"28801\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 6401}, {\"value_count\": 200, \"group_name\": \"_zip_code_\", \"value\": \"10017\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 6401}, {\"value_count\": 197, \"group_name\": \"_zip_code_\", \"value\": \"27517\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 6401}, {\"value_count\": 181, \"group_name\": \"_zip_code_\", \"value\": \"77056\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 6401}, {\"value_count\": 178, \"group_name\": \"_zip_code_\", \"value\": \"07302\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 6401}, {\"value_count\": 175, \"group_name\": \"_zip_code_\", \"value\": \"02110\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 6401}, {\"value_count\": 173, \"group_name\": \"_zip_code_\", \"value\": \"37201\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 6401}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"06076\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 6401}, {\"value_count\": 1, \"group_name\": \"_zip_code_\", 
\"value\": \"10029\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 6401}, {\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"93206\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 6401}, {\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"08536\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 6401}, {\"value_count\": 1, \"group_name\": \"_zip_code_\", \"value\": \"86301\", \"total_non_null_rows\": 24953, \"total_rows_inc_nulls\": 39407, \"distinct_value_count\": 6401}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 590]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}], \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\"}, {\"mode\": \"vega-lite\"});\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"vconcat\": [{\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 1, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 9897.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9991835355758667, \"percentile_inc_nulls\": 0.9991835355758667, \"value_count\": 17, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 17.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9984630942344666, \"percentile_inc_nulls\": 0.9984630942344666, \"value_count\": 15, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 15.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9978387355804443, \"percentile_inc_nulls\": 0.9978387355804443, \"value_count\": 13, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 13.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9972623586654663, \"percentile_inc_nulls\": 0.9972623586654663, \"value_count\": 12, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 12.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.996734082698822, \"percentile_inc_nulls\": 0.996734082698822, \"value_count\": 11, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 11.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9952932000160217, 
\"percentile_inc_nulls\": 0.9952932000160217, \"value_count\": 10, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 30.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9944286942481995, \"percentile_inc_nulls\": 0.9944286942481995, \"value_count\": 9, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 18.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9917390942573547, \"percentile_inc_nulls\": 0.9917390942573547, \"value_count\": 8, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9883770942687988, \"percentile_inc_nulls\": 0.9883770942687988, \"value_count\": 7, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9765621423721313, \"percentile_inc_nulls\": 0.9765621423721313, \"value_count\": 6, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 246.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9439027905464172, \"percentile_inc_nulls\": 0.9439027905464172, \"value_count\": 5, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 680.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.8651361465454102, \"percentile_inc_nulls\": 0.8651361465454102, \"value_count\": 4, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1640.0, \"distinct_value_count\": 14086}, 
{\"percentile_ex_nulls\": 0.725949764251709, \"percentile_inc_nulls\": 0.725949764251709, \"value_count\": 3, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 2898.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.4753373861312866, \"percentile_inc_nulls\": 0.4753373861312866, \"value_count\": 2, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 5218.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 1, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 9897.0, \"distinct_value_count\": 14086}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"company_name\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 14086 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 17, \"group_name\": \"_company_name_\", \"value\": \"calpine corporation\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 15, \"group_name\": \"_company_name_\", \"value\": \"georgia pacific corporation\", \"total_non_null_rows\": 20821, 
\"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 13, \"group_name\": \"_company_name_\", \"value\": \"weyerhaeuser company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 12, \"group_name\": \"_company_name_\", \"value\": \"calpine eastern corporation\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 11, \"group_name\": \"_company_name_\", \"value\": \"calpine operating services company incorporated\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 10, \"group_name\": \"_company_name_\", \"value\": \"dow chemical company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 10, \"group_name\": \"_company_name_\", \"value\": \"international paper company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 10, \"group_name\": \"_company_name_\", \"value\": \"springfield city of\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 9, \"group_name\": \"_company_name_\", \"value\": \"newpage corporation\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 9, \"group_name\": \"_company_name_\", \"value\": \"smurfit stone container corporation\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": 
null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"10 briggs solar ng limited liability company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"3880 north mission road solar limited liability company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"41mb 8me limited liability company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"4c acquisition limited liability company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"59fed wham8 solar limited liability company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 17]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.0012776851654052734, 
\"percentile_inc_nulls\": 0.02391815185546875, \"value_count\": 24, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 24.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.0004914402961730957, \"percentile_inc_nulls\": 0.023149728775024414, \"value_count\": 16, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.0003439784049987793, \"percentile_inc_nulls\": 0.02300560474395752, \"value_count\": 3, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 3.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.00024569034576416016, \"percentile_inc_nulls\": 0.022909581661224365, \"value_count\": 2, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 2.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.022669434547424316, \"value_count\": 1, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 5.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.8766032457351685, \"percentile_inc_nulls\": 0.8794006109237671, \"value_count\": 2511, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 2511.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.7900633811950684, \"percentile_inc_nulls\": 0.7948225140571594, \"value_count\": 1761, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1761.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.7252445220947266, \"percentile_inc_nulls\": 
0.7314730286598206, \"value_count\": 1319, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1319.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.6654381155967712, \"percentile_inc_nulls\": 0.6730223894119263, \"value_count\": 1217, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1217.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.6124134063720703, \"percentile_inc_nulls\": 0.6211997270584106, \"value_count\": 1079, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1079.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.5711337327957153, \"percentile_inc_nulls\": 0.5808558464050293, \"value_count\": 840, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 840.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.5326551795005798, \"percentile_inc_nulls\": 0.5432496070861816, \"value_count\": 783, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 783.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.49756747484207153, \"percentile_inc_nulls\": 0.5089572668075562, \"value_count\": 714, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 714.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.4656739830970764, \"percentile_inc_nulls\": 0.4777868390083313, \"value_count\": 649, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 649.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.4348616600036621, \"percentile_inc_nulls\": 
0.4476730227470398, \"value_count\": 627, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 627.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.410388708114624, \"percentile_inc_nulls\": 0.42375487089157104, \"value_count\": 498, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 498.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.3862597942352295, \"percentile_inc_nulls\": 0.4001728892326355, \"value_count\": 491, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 491.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.3641948103904724, \"percentile_inc_nulls\": 0.3786081075668335, \"value_count\": 449, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 449.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.3430144190788269, \"percentile_inc_nulls\": 0.35790789127349854, \"value_count\": 431, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 431.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.3237505555152893, \"percentile_inc_nulls\": 0.3390807509422302, \"value_count\": 392, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 392.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.30483072996139526, \"percentile_inc_nulls\": 0.3205897808074951, \"value_count\": 385, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 385.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.2861074209213257, \"percentile_inc_nulls\": 
0.30229097604751587, \"value_count\": 381, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 381.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.2678264379501343, \"percentile_inc_nulls\": 0.284424364566803, \"value_count\": 372, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 372.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.2511671185493469, \"percentile_inc_nulls\": 0.2681427597999573, \"value_count\": 339, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 339.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.235687255859375, \"percentile_inc_nulls\": 0.2530137896537781, \"value_count\": 315, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 315.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.2217307686805725, \"percentile_inc_nulls\": 0.23937368392944336, \"value_count\": 284, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 284.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.20811831951141357, \"percentile_inc_nulls\": 0.22606980800628662, \"value_count\": 277, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 277.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.19470244646072388, \"percentile_inc_nulls\": 0.21295809745788574, \"value_count\": 273, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 273.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.1813356876373291, \"percentile_inc_nulls\": 
0.19989430904388428, \"value_count\": 272, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 272.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.15548676252365112, \"percentile_inc_nulls\": 0.17463135719299316, \"value_count\": 263, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 526.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.14349597692489624, \"percentile_inc_nulls\": 0.16291242837905884, \"value_count\": 244, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 244.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.13199663162231445, \"percentile_inc_nulls\": 0.1516737937927246, \"value_count\": 234, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.12148016691207886, \"percentile_inc_nulls\": 0.1413956880569458, \"value_count\": 214, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 214.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.11145508289337158, \"percentile_inc_nulls\": 0.1315978765487671, \"value_count\": 204, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 204.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.10162663459777832, \"percentile_inc_nulls\": 0.12199223041534424, \"value_count\": 200, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.09219127893447876, \"percentile_inc_nulls\": 
0.11277073621749878, \"value_count\": 192, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 192.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.08457416296005249, \"percentile_inc_nulls\": 0.10532635450363159, \"value_count\": 155, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 155.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.07730108499526978, \"percentile_inc_nulls\": 0.09821814298629761, \"value_count\": 148, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 148.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.07012629508972168, \"percentile_inc_nulls\": 0.09120601415634155, \"value_count\": 146, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 146.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.06373775005340576, \"percentile_inc_nulls\": 0.08496230840682983, \"value_count\": 130, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 130.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.05759495496749878, \"percentile_inc_nulls\": 0.07895874977111816, \"value_count\": 125, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 125.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.052091002464294434, \"percentile_inc_nulls\": 0.07357954978942871, \"value_count\": 112, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.0471767783164978, \"percentile_inc_nulls\": 
0.06877672672271729, \"value_count\": 100, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 100.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.042459070682525635, \"percentile_inc_nulls\": 0.06416600942611694, \"value_count\": 96, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 96.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.0380362868309021, \"percentile_inc_nulls\": 0.05984342098236084, \"value_count\": 90, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.033662617206573486, \"percentile_inc_nulls\": 0.0555688738822937, \"value_count\": 89, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 89.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.029436349868774414, \"percentile_inc_nulls\": 0.05143845081329346, \"value_count\": 86, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 86.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.02555406093597412, \"percentile_inc_nulls\": 0.047644197940826416, \"value_count\": 79, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 79.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.021720945835113525, \"percentile_inc_nulls\": 0.04389798641204834, \"value_count\": 78, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.018526732921600342, \"percentile_inc_nulls\": 
0.04077613353729248, \"value_count\": 65, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 65.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.01572561264038086, \"percentile_inc_nulls\": 0.03803849220275879, \"value_count\": 57, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 57.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.013071894645690918, \"percentile_inc_nulls\": 0.03544497489929199, \"value_count\": 54, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 54.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.010467350482940674, \"percentile_inc_nulls\": 0.03289949893951416, \"value_count\": 53, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 53.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.006142795085906982, \"percentile_inc_nulls\": 0.028672993183135986, \"value_count\": 44, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.004078805446624756, \"percentile_inc_nulls\": 0.02665579319000244, \"value_count\": 42, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.002457141876220703, \"percentile_inc_nulls\": 0.02507084608078003, \"value_count\": 33, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 24, 
\"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 24.0, \"distinct_value_count\": 62}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"state\\\"\", \"subtitle\": \"In this col, 472 values (2.3%) are null and there are 62 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 2511, \"group_name\": \"_state_\", \"value\": \"ca\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1761, \"group_name\": \"_state_\", \"value\": \"tx\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1319, \"group_name\": \"_state_\", \"value\": \"ny\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1217, \"group_name\": \"_state_\", \"value\": \"nc\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1079, \"group_name\": \"_state_\", \"value\": \"ma\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 840, \"group_name\": \"_state_\", \"value\": \"fl\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 783, 
\"group_name\": \"_state_\", \"value\": \"mn\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 714, \"group_name\": \"_state_\", \"value\": \"nj\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 649, \"group_name\": \"_state_\", \"value\": \"pa\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 627, \"group_name\": \"_state_\", \"value\": \"il\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"pr\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"uk\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"as\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"8a\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"gu\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, 
\"distinct_value_count\": 62}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 2511]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9553115367889404, \"percentile_inc_nulls\": 0.9554296135902405, \"value_count\": 928, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 928.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.9231917858123779, \"percentile_inc_nulls\": 0.9233946800231934, \"value_count\": 667, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 667.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.89848792552948, \"percentile_inc_nulls\": 0.8987560868263245, \"value_count\": 513, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 513.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.8761436939239502, \"percentile_inc_nulls\": 0.8764708638191223, \"value_count\": 464, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 464.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.8545699715614319, \"percentile_inc_nulls\": 0.8549541234970093, \"value_count\": 448, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, 
\"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 448.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.8359819054603577, \"percentile_inc_nulls\": 0.83641517162323, \"value_count\": 386, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 386.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.8250987529754639, \"percentile_inc_nulls\": 0.8255607485771179, \"value_count\": 226, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 226.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.815178632736206, \"percentile_inc_nulls\": 0.8156668543815613, \"value_count\": 206, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 206.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.8057401180267334, \"percentile_inc_nulls\": 0.8062533140182495, \"value_count\": 196, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 196.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7966387271881104, \"percentile_inc_nulls\": 0.797175943851471, \"value_count\": 189, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 189.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7878744006156921, \"percentile_inc_nulls\": 0.7884347438812256, \"value_count\": 182, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 182.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7795916199684143, \"percentile_inc_nulls\": 0.7801738977432251, \"value_count\": 172, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, 
\"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 172.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7714051604270935, \"percentile_inc_nulls\": 0.772009015083313, \"value_count\": 170, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 170.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7633631825447083, \"percentile_inc_nulls\": 0.7639882564544678, \"value_count\": 167, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 167.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7555619478225708, \"percentile_inc_nulls\": 0.7562077045440674, \"value_count\": 162, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 162.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7479052543640137, \"percentile_inc_nulls\": 0.7485711574554443, \"value_count\": 159, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 159.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7406337261199951, \"percentile_inc_nulls\": 0.7413188219070435, \"value_count\": 151, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 151.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7338438034057617, \"percentile_inc_nulls\": 0.7345468401908875, \"value_count\": 141, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 141.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7271501421928406, \"percentile_inc_nulls\": 0.7278709411621094, \"value_count\": 139, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, 
\"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 139.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7211307287216187, \"percentile_inc_nulls\": 0.7218673229217529, \"value_count\": 125, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 125.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.715207576751709, \"percentile_inc_nulls\": 0.7159598469734192, \"value_count\": 123, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 123.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7093807458877563, \"percentile_inc_nulls\": 0.7101483941078186, \"value_count\": 121, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 121.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7039391398429871, \"percentile_inc_nulls\": 0.704721212387085, \"value_count\": 113, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 113.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6933448910713196, \"percentile_inc_nulls\": 0.6941549777984619, \"value_count\": 110, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 220.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6880959272384644, \"percentile_inc_nulls\": 0.6889198422431946, \"value_count\": 109, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 109.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.677983283996582, \"percentile_inc_nulls\": 0.6788338422775269, \"value_count\": 105, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, 
\"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 210.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6729750633239746, \"percentile_inc_nulls\": 0.6738389134407043, \"value_count\": 104, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 104.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6682076454162598, \"percentile_inc_nulls\": 0.6690840721130371, \"value_count\": 99, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6634883880615234, \"percentile_inc_nulls\": 0.6643773317337036, \"value_count\": 98, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6589136123657227, \"percentile_inc_nulls\": 0.6598145961761475, \"value_count\": 95, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 95.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6501492857933044, \"percentile_inc_nulls\": 0.6510734558105469, \"value_count\": 91, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 182.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6459115743637085, \"percentile_inc_nulls\": 0.6468468904495239, \"value_count\": 88, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6420109868049622, \"percentile_inc_nulls\": 0.6429566144943237, \"value_count\": 81, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, 
\"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 81.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6383511424064636, \"percentile_inc_nulls\": 0.6393064260482788, \"value_count\": 76, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 76.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6313204169273376, \"percentile_inc_nulls\": 0.6322942972183228, \"value_count\": 73, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 146.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6243860125541687, \"percentile_inc_nulls\": 0.6253782510757446, \"value_count\": 72, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6212077140808105, \"percentile_inc_nulls\": 0.6222083568572998, \"value_count\": 66, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 66.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6180776357650757, \"percentile_inc_nulls\": 0.619086503982544, \"value_count\": 65, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 65.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6150919795036316, \"percentile_inc_nulls\": 0.616108775138855, \"value_count\": 62, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 62.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.612250804901123, \"percentile_inc_nulls\": 0.6132750511169434, \"value_count\": 59, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 
20821, \"sum_tokens_in_value_count_group\": 59.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6094577312469482, \"percentile_inc_nulls\": 0.6104893684387207, \"value_count\": 58, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6041606664657593, \"percentile_inc_nulls\": 0.6052062511444092, \"value_count\": 55, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5990561246871948, \"percentile_inc_nulls\": 0.6001152992248535, \"value_count\": 53, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 106.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5940479636192322, \"percentile_inc_nulls\": 0.5951203107833862, \"value_count\": 52, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 104.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.591592013835907, \"percentile_inc_nulls\": 0.5926708579063416, \"value_count\": 51, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 51.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5891842842102051, \"percentile_inc_nulls\": 0.5902694463729858, \"value_count\": 50, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 50.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5869209170341492, \"percentile_inc_nulls\": 0.5880120992660522, \"value_count\": 47, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, 
\"sum_tokens_in_value_count_group\": 47.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5847057700157166, \"percentile_inc_nulls\": 0.5858027935028076, \"value_count\": 46, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 46.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5825387239456177, \"percentile_inc_nulls\": 0.583641529083252, \"value_count\": 45, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 45.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5783973932266235, \"percentile_inc_nulls\": 0.5795110464096069, \"value_count\": 43, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 86.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5743522644042969, \"percentile_inc_nulls\": 0.5754766464233398, \"value_count\": 42, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 84.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.568429172039032, \"percentile_inc_nulls\": 0.5695691704750061, \"value_count\": 41, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 123.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5626504421234131, \"percentile_inc_nulls\": 0.5638057589530945, \"value_count\": 40, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5607724189758301, \"percentile_inc_nulls\": 0.5619326829910278, \"value_count\": 39, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, 
\"sum_tokens_in_value_count_group\": 39.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5536453723907471, \"percentile_inc_nulls\": 0.5548244714736938, \"value_count\": 37, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 148.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5519117712974548, \"percentile_inc_nulls\": 0.5530954599380493, \"value_count\": 36, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 36.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5485408902168274, \"percentile_inc_nulls\": 0.5497334003448486, \"value_count\": 35, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.537416934967041, \"percentile_inc_nulls\": 0.5386388301849365, \"value_count\": 33, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 231.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5327939987182617, \"percentile_inc_nulls\": 0.534028172492981, \"value_count\": 32, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 96.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5253298282623291, \"percentile_inc_nulls\": 0.526583731174469, \"value_count\": 31, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 155.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.523885190486908, \"percentile_inc_nulls\": 0.5251429080963135, \"value_count\": 30, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, 
\"sum_tokens_in_value_count_group\": 30.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5210921764373779, \"percentile_inc_nulls\": 0.5223572254180908, \"value_count\": 29, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5156987309455872, \"percentile_inc_nulls\": 0.5169780254364014, \"value_count\": 28, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5104979276657104, \"percentile_inc_nulls\": 0.5117909908294678, \"value_count\": 27, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5067417621612549, \"percentile_inc_nulls\": 0.5080447196960449, \"value_count\": 26, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.4995184540748596, \"percentile_inc_nulls\": 0.5008404850959778, \"value_count\": 25, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 150.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.49720698595046997, \"percentile_inc_nulls\": 0.49853515625, \"value_count\": 24, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 48.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.49056148529052734, \"percentile_inc_nulls\": 0.49190717935562134, \"value_count\": 23, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, 
\"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.4852643609046936, \"percentile_inc_nulls\": 0.4866240620613098, \"value_count\": 22, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.47616297006607056, \"percentile_inc_nulls\": 0.47754669189453125, \"value_count\": 21, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 189.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.46845805644989014, \"percentile_inc_nulls\": 0.469862163066864, \"value_count\": 20, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.46388328075408936, \"percentile_inc_nulls\": 0.46529942750930786, \"value_count\": 19, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 95.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.4595492482185364, \"percentile_inc_nulls\": 0.46097689867019653, \"value_count\": 18, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.4489068388938904, \"percentile_inc_nulls\": 0.4503626227378845, \"value_count\": 17, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 221.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.44120198488235474, \"percentile_inc_nulls\": 0.4426780939102173, \"value_count\": 16, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, 
\"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.43325626850128174, \"percentile_inc_nulls\": 0.4347533583641052, \"value_count\": 15, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 165.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.42044687271118164, \"percentile_inc_nulls\": 0.42197781801223755, \"value_count\": 14, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 266.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.4135606288909912, \"percentile_inc_nulls\": 0.4151097536087036, \"value_count\": 13, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 143.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.4025811553001404, \"percentile_inc_nulls\": 0.4041592478752136, \"value_count\": 12, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 228.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.3919869065284729, \"percentile_inc_nulls\": 0.3935930132865906, \"value_count\": 11, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 220.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.37465089559555054, \"percentile_inc_nulls\": 0.3763027787208557, \"value_count\": 10, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 360.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.35991525650024414, \"percentile_inc_nulls\": 0.36160606145858765, \"value_count\": 9, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, 
\"sum_tokens_in_value_count_group\": 306.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.33757102489471436, \"percentile_inc_nulls\": 0.3393208980560303, \"value_count\": 8, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 464.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.31869399547576904, \"percentile_inc_nulls\": 0.32049375772476196, \"value_count\": 7, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 392.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.2932678461074829, \"percentile_inc_nulls\": 0.2951347231864929, \"value_count\": 6, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 528.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.25739187002182007, \"percentile_inc_nulls\": 0.25935351848602295, \"value_count\": 5, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 745.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.2086583971977234, \"percentile_inc_nulls\": 0.2107487916946411, \"value_count\": 4, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1012.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.15621691942214966, \"percentile_inc_nulls\": 0.15844577550888062, \"value_count\": 3, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1089.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.09313303232192993, \"percentile_inc_nulls\": 0.09552854299545288, \"value_count\": 2, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, 
\"sum_tokens_in_value_count_group\": 1310.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0026415586471557617, \"value_count\": 1, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1934.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 928, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 928.0, \"distinct_value_count\": 3879}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"city\\\"\", \"subtitle\": \"In this col, 55 values (0.3%) are null and there are 3879 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 928, \"group_name\": \"_city_\", \"value\": \"houston\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 667, \"group_name\": \"_city_\", \"value\": \"new york\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 513, \"group_name\": \"_city_\", \"value\": \"san francisco\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 464, \"group_name\": \"_city_\", \"value\": \"juno 
beach\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 448, \"group_name\": \"_city_\", \"value\": \"boston\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 386, \"group_name\": \"_city_\", \"value\": \"charlotte\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 226, \"group_name\": \"_city_\", \"value\": \"chicago\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 206, \"group_name\": \"_city_\", \"value\": \"san diego\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 196, \"group_name\": \"_city_\", \"value\": \"andover\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 189, \"group_name\": \"_city_\", \"value\": \"nashville\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"ft. 
washington\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"yadkinville\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"east longmeadow\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"lebo\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"londonderry\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 928]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.7171125411987305, \"percentile_inc_nulls\": 0.7171125411987305, \"value_count\": 8, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 376.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.6966043710708618, \"percentile_inc_nulls\": 0.6966043710708618, \"value_count\": 7, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 
427.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.6680755019187927, \"percentile_inc_nulls\": 0.6680755019187927, \"value_count\": 6, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 594.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.6394985914230347, \"percentile_inc_nulls\": 0.6394985914230347, \"value_count\": 5, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 595.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.6006916165351868, \"percentile_inc_nulls\": 0.6006916165351868, \"value_count\": 4, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 808.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.548388659954071, \"percentile_inc_nulls\": 0.548388659954071, \"value_count\": 3, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1089.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.43811535835266113, \"percentile_inc_nulls\": 0.43811535835266113, \"value_count\": 2, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 2296.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 1, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 9122.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9795879125595093, \"percentile_inc_nulls\": 0.9795879125595093, \"value_count\": 425, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, 
\"sum_tokens_in_value_count_group\": 425.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9712309837341309, \"percentile_inc_nulls\": 0.9712309837341309, \"value_count\": 174, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 174.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.964026689529419, \"percentile_inc_nulls\": 0.964026689529419, \"value_count\": 150, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 150.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9597522020339966, \"percentile_inc_nulls\": 0.9597522020339966, \"value_count\": 89, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 89.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9559579491615295, \"percentile_inc_nulls\": 0.9559579491615295, \"value_count\": 79, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 79.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9490418434143066, \"percentile_inc_nulls\": 0.9490418434143066, \"value_count\": 72, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.942702054977417, \"percentile_inc_nulls\": 0.942702054977417, \"value_count\": 66, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 132.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9365544319152832, \"percentile_inc_nulls\": 0.9365544319152832, \"value_count\": 64, \"group_name\": \"_street_address_\", 
\"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 128.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9335286617279053, \"percentile_inc_nulls\": 0.9335286617279053, \"value_count\": 63, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 63.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9305508732795715, \"percentile_inc_nulls\": 0.9305508732795715, \"value_count\": 62, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 62.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9277172088623047, \"percentile_inc_nulls\": 0.9277172088623047, \"value_count\": 59, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 59.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9251717329025269, \"percentile_inc_nulls\": 0.9251717329025269, \"value_count\": 53, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 53.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9202728271484375, \"percentile_inc_nulls\": 0.9202728271484375, \"value_count\": 51, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9179674386978149, \"percentile_inc_nulls\": 0.9179674386978149, \"value_count\": 48, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 48.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9111953973770142, \"percentile_inc_nulls\": 0.9111953973770142, 
\"value_count\": 47, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 141.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9089860916137695, \"percentile_inc_nulls\": 0.9089860916137695, \"value_count\": 46, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 46.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9068248271942139, \"percentile_inc_nulls\": 0.9068248271942139, \"value_count\": 45, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 45.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9047116041183472, \"percentile_inc_nulls\": 0.9047116041183472, \"value_count\": 44, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 44.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9027424454689026, \"percentile_inc_nulls\": 0.9027424454689026, \"value_count\": 41, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 41.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8989001512527466, \"percentile_inc_nulls\": 0.8989001512527466, \"value_count\": 40, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 80.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8915998339653015, \"percentile_inc_nulls\": 0.8915998339653015, \"value_count\": 38, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 152.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 
0.886268675327301, \"percentile_inc_nulls\": 0.886268675327301, \"value_count\": 37, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 111.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8829066753387451, \"percentile_inc_nulls\": 0.8829066753387451, \"value_count\": 35, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8813217282295227, \"percentile_inc_nulls\": 0.8813217282295227, \"value_count\": 33, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8738773465156555, \"percentile_inc_nulls\": 0.8738773465156555, \"value_count\": 31, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 155.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8709956407546997, \"percentile_inc_nulls\": 0.8709956407546997, \"value_count\": 30, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 60.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8696027994155884, \"percentile_inc_nulls\": 0.8696027994155884, \"value_count\": 29, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 29.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8628787994384766, \"percentile_inc_nulls\": 0.8628787994384766, \"value_count\": 28, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 140.0, 
\"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8563950061798096, \"percentile_inc_nulls\": 0.8563950061798096, \"value_count\": 27, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 135.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8526487350463867, \"percentile_inc_nulls\": 0.8526487350463867, \"value_count\": 26, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8490466475486755, \"percentile_inc_nulls\": 0.8490466475486755, \"value_count\": 25, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 75.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8455885648727417, \"percentile_inc_nulls\": 0.8455885648727417, \"value_count\": 24, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 72.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8411699533462524, \"percentile_inc_nulls\": 0.8411699533462524, \"value_count\": 23, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 92.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8401133418083191, \"percentile_inc_nulls\": 0.8401133418083191, \"value_count\": 22, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 22.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8380961418151855, \"percentile_inc_nulls\": 0.8380961418151855, \"value_count\": 21, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, 
\"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8332933187484741, \"percentile_inc_nulls\": 0.8332933187484741, \"value_count\": 20, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 100.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.828730583190918, \"percentile_inc_nulls\": 0.828730583190918, \"value_count\": 19, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 95.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8218145370483398, \"percentile_inc_nulls\": 0.8218145370483398, \"value_count\": 18, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8136496543884277, \"percentile_inc_nulls\": 0.8136496543884277, \"value_count\": 17, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 170.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8082705140113831, \"percentile_inc_nulls\": 0.8082705140113831, \"value_count\": 16, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.800345778465271, \"percentile_inc_nulls\": 0.800345778465271, \"value_count\": 15, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 165.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.7956390380859375, \"percentile_inc_nulls\": 0.7956390380859375, \"value_count\": 14, \"group_name\": 
\"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.7844003438949585, \"percentile_inc_nulls\": 0.7844003438949585, \"value_count\": 13, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.7728735208511353, \"percentile_inc_nulls\": 0.7728735208511353, \"value_count\": 12, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 240.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.7633638978004456, \"percentile_inc_nulls\": 0.7633638978004456, \"value_count\": 11, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 198.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.7494356632232666, \"percentile_inc_nulls\": 0.7494356632232666, \"value_count\": 10, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 290.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.7351711988449097, \"percentile_inc_nulls\": 0.7351711988449097, \"value_count\": 9, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 297.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 8, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 376.0, \"distinct_value_count\": 11403}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": 
\"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"street_address\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 11403 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 425, \"group_name\": \"_street_address_\", \"value\": \"700 universe blvd\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 174, \"group_name\": \"_street_address_\", \"value\": \"130 roberts st\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 150, \"group_name\": \"_street_address_\", \"value\": \"800 taylor st suite 200\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 89, \"group_name\": \"_street_address_\", \"value\": \"333 washington st\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 79, \"group_name\": \"_street_address_\", \"value\": \"1519 king st\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 72, \"group_name\": \"_street_address_\", \"value\": \"575 fifth ave 35th fl\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 72, \"group_name\": \"_street_address_\", \"value\": \"222 2nd ave south suite 1900\", 
\"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 66, \"group_name\": \"_street_address_\", \"value\": \"50101 governors dr\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 66, \"group_name\": \"_street_address_\", \"value\": \"101 summer st 2nd floor\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 64, \"group_name\": \"_street_address_\", \"value\": \"66 york st 5th floor\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"300 spectrum ctr dr ste1020\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"1310 mackie rd\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"26w271 durfree st\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"180 hbr dr\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, 
{\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"964 lebanon church rd\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 425]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}], \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\"}, {\"mode\": \"vega-lite\"});\n",
        "</script>"
       ],
       "text/plain": [
        "alt.VConcatChart(...)"
       ]
      },
-     "execution_count": 132,
+     "execution_count": 116,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "profile_columns(eia_match_df[match_cols], db_api=DuckDBAPI(), top_n=10, bottom_n=5)"
+    "profile_columns(eia_match_df[MATCH_COLS], db_api=DuckDBAPI(), top_n=10, bottom_n=5)"
    ]
   },
   {
@@ -1415,35 +1041,28 @@
    ]
   },
   {
-   "cell_type": "markdown",
-   "id": "5d0b403f-8a1a-4ee2-89db-f274f6a55bbd",
-   "metadata": {},
-   "source": [
-    "TODO: import BLOCKING RULES from config"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 67,
-   "id": "1e6d3d24-f8ce-4118-9d31-d4f237d28237",
+   "cell_type": "code",
+   "execution_count": 117,
+   "id": "1e6d3d24-f8ce-4118-9d31-d4f237d28237",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "{'number_of_comparisons_generated_pre_filter_conditions': 988101,\n",
-       " 'number_of_comparisons_to_be_scored_post_filter_conditions': 988101,\n",
+       "{'number_of_comparisons_generated_pre_filter_conditions': 487944,\n",
+       " 'number_of_comparisons_to_be_scored_post_filter_conditions': 487944,\n",
        " 'filter_conditions_identified': '',\n",
        " 'equi_join_conditions_identified': 'SUBSTRING(l.company_name_mphone, 1, 4) = SUBSTRING(r.company_name_mphone, 1, 4)',\n",
        " 'link_type_join_condition': 'where l.\"source_dataset\" || \\'-__-\\' || l.\"record_id\" < r.\"source_dataset\" || \\'-__-\\' || r.\"record_id\" and l.\"source_dataset\" != r.\"source_dataset\"'}"
       ]
      },
-     "execution_count": 67,
+     "execution_count": 117,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
+    "# useful for experimenting with a new blocking rule\n",
     "counts = count_comparisons_from_blocking_rule(\n",
     "    table_or_tables=[sec_match_df, eia_match_df],\n",
     "    blocking_rule=BLOCKING_RULES[0],\n",
@@ -1457,7 +1076,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 68,
+   "execution_count": 118,
    "id": "c37c117a-0fa2-4b8a-9fae-da3569895ca3",
    "metadata": {},
    "outputs": [
@@ -1491,24 +1110,24 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>AMRK</td>\n",
-       "      <td>888</td>\n",
-       "      <td>85</td>\n",
-       "      <td>75480</td>\n",
+       "      <td>INTR</td>\n",
+       "      <td>445</td>\n",
+       "      <td>76</td>\n",
+       "      <td>33820</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>INTR</td>\n",
-       "      <td>468</td>\n",
-       "      <td>157</td>\n",
-       "      <td>73476</td>\n",
+       "      <td>AMRK</td>\n",
+       "      <td>851</td>\n",
+       "      <td>38</td>\n",
+       "      <td>32338</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td>FRST</td>\n",
-       "      <td>836</td>\n",
-       "      <td>82</td>\n",
-       "      <td>68552</td>\n",
+       "      <td>816</td>\n",
+       "      <td>36</td>\n",
+       "      <td>29376</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
@@ -1516,12 +1135,12 @@
       ],
       "text/plain": [
        "  key_0  count_l  count_r  block_count\n",
-       "0  AMRK      888       85        75480\n",
-       "1  INTR      468      157        73476\n",
-       "2  FRST      836       82        68552"
+       "0  INTR      445       76        33820\n",
+       "1  AMRK      851       38        32338\n",
+       "2  FRST      816       36        29376"
       ]
      },
-     "execution_count": 68,
+     "execution_count": 118,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1540,46 +1159,32 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 69,
+   "execution_count": 121,
    "id": "4e1a9844-5d98-4cac-a083-eef134f083ce",
    "metadata": {},
    "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "bf1ed000055946dcbdc2d64e635de891",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
     {
      "data": {
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-7144afd26472470d8fe5764a8949ebb8.vega-embed {\n",
+       "  #altair-viz-7213d070f2cd42878025324dddfeb43b.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-7144afd26472470d8fe5764a8949ebb8.vega-embed details,\n",
-       "  #altair-viz-7144afd26472470d8fe5764a8949ebb8.vega-embed details summary {\n",
+       "  #altair-viz-7213d070f2cd42878025324dddfeb43b.vega-embed details,\n",
+       "  #altair-viz-7213d070f2cd42878025324dddfeb43b.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-7144afd26472470d8fe5764a8949ebb8\"></div>\n",
+       "<div id=\"altair-viz-7213d070f2cd42878025324dddfeb43b\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-7144afd26472470d8fe5764a8949ebb8\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-7144afd26472470d8fe5764a8949ebb8\");\n",
+       "    if (outputDiv.id !== \"altair-viz-7213d070f2cd42878025324dddfeb43b\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-7213d070f2cd42878025324dddfeb43b\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -1625,14 +1230,14 @@
        "        .catch(showError)\n",
        "        .then(() => displayChart(vegaEmbed));\n",
        "    }\n",
-       "  })({\"config\": {\"view\": {\"continuousWidth\": 300, \"continuousHeight\": 300}}, \"data\": {\"name\": \"data-8fc653ccc17479a7e2943968c5585e30\"}, \"mark\": \"bar\", \"encoding\": {\"order\": {\"field\": \"cumulative_rows\"}, \"tooltip\": [{\"field\": \"blocking_rule\", \"title\": \"SQL Condition\", \"type\": \"nominal\"}, {\"field\": \"row_count\", \"format\": \",\", \"title\": \"Comparisons Generated\", \"type\": \"quantitative\"}, {\"field\": \"cumulative_rows\", \"format\": \",\", \"title\": \"Cumulative Comparisons\", \"type\": \"quantitative\"}, {\"field\": \"cartesian\", \"format\": \",\", \"title\": \"Total comparisons in Cartesian product\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"start\", \"title\": \"Comparisons Generated by Rule(s)\", \"type\": \"quantitative\"}, \"x2\": {\"field\": \"cumulative_rows\"}, \"y\": {\"field\": \"blocking_rule\", \"sort\": [\"-x2\"], \"title\": \"SQL Blocking Rule\"}}, \"height\": {\"step\": 20}, \"title\": {\"text\": \"Count of Additional Comparisons Generated by Each Blocking Rule\", \"subtitle\": \"(Counts exclude comparisons already generated by previous rules)\"}, \"width\": 450, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-8fc653ccc17479a7e2943968c5585e30\": [{\"blocking_rule\": \"substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4)\", \"row_count\": 988101, \"cumulative_rows\": 988101, \"cartesian\": 2542342605, \"match_key\": \"0\", \"start\": 0}, {\"blocking_rule\": \"l.street_address = r.street_address\", \"row_count\": 9184, \"cumulative_rows\": 997285, \"cartesian\": 2542342605, \"match_key\": \"1\", \"start\": 988101}, {\"blocking_rule\": \"substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3) and l.city = r.city\", \"row_count\": 13507, \"cumulative_rows\": 1010792, \"cartesian\": 2542342605, \"match_key\": \"2\", \"start\": 997285}, {\"blocking_rule\": \"substr(l.company_name_mphone,1,2) = 
substr(r.company_name_mphone,1,2) and array_length(list_intersect(l.street_address_list, r.street_address_list)) >= 2\", \"row_count\": 27665, \"cumulative_rows\": 1038457, \"cartesian\": 2542342605, \"match_key\": \"3\", \"start\": 1010792}]}}, {\"mode\": \"vega-lite\"});\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 300, \"continuousHeight\": 300}}, \"data\": {\"name\": \"data-c9738a9d1eccd268140e81d460277b02\"}, \"mark\": \"bar\", \"encoding\": {\"order\": {\"field\": \"cumulative_rows\"}, \"tooltip\": [{\"field\": \"blocking_rule\", \"title\": \"SQL Condition\", \"type\": \"nominal\"}, {\"field\": \"row_count\", \"format\": \",\", \"title\": \"Comparisons Generated\", \"type\": \"quantitative\"}, {\"field\": \"cumulative_rows\", \"format\": \",\", \"title\": \"Cumulative Comparisons\", \"type\": \"quantitative\"}, {\"field\": \"cartesian\", \"format\": \",\", \"title\": \"Total comparisons in Cartesian product\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"start\", \"title\": \"Comparisons Generated by Rule(s)\", \"type\": \"quantitative\"}, \"x2\": {\"field\": \"cumulative_rows\"}, \"y\": {\"field\": \"blocking_rule\", \"sort\": [\"-x2\"], \"title\": \"SQL Blocking Rule\"}}, \"height\": {\"step\": 20}, \"title\": {\"text\": \"Count of Additional Comparisons Generated by Each Blocking Rule\", \"subtitle\": \"(Counts exclude comparisons already generated by previous rules)\"}, \"width\": 450, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-c9738a9d1eccd268140e81d460277b02\": [{\"blocking_rule\": \"substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4)\", \"row_count\": 487944, \"cumulative_rows\": 487944, \"cartesian\": 1270622346, \"match_key\": \"0\", \"start\": 0}, {\"blocking_rule\": \"l.street_address = r.street_address\", \"row_count\": 13016, \"cumulative_rows\": 500960, \"cartesian\": 1270622346, \"match_key\": \"1\", \"start\": 487944}, {\"blocking_rule\": \"substr(l.company_name_mphone,1,2) = substr(r.company_name_mphone,1,2) and l.city = r.city\", \"row_count\": 89615, \"cumulative_rows\": 590575, \"cartesian\": 1270622346, \"match_key\": \"2\", \"start\": 500960}]}}, {\"mode\": \"vega-lite\"});\n",
        "</script>"
       ],
       "text/plain": [
        "alt.Chart(...)"
       ]
      },
-     "execution_count": 69,
+     "execution_count": 121,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1655,14 +1260,6 @@
     "## Create Model"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "d35162e9-f671-4e99-a261-e1bd4d16717e",
-   "metadata": {},
-   "source": [
-    "TODO: import comparisons from config"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 334,
@@ -1688,7 +1285,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 386,
+   "execution_count": 422,
+   "id": "d2e043ed-7f64-4547-992d-7f947a63db6d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# NOT USED\n",
+    "address_comparison = cl.CustomComparison(\n",
+    "    comparison_levels = [\n",
+    "        cll.NullLevel(\"street_address\"),\n",
+    "        cll.ExactMatchLevel(\"street_address\"),\n",
+    "        cll.LevenshteinLevel(\"street_address\", distance_threshold=1),\n",
+    "        cll.ArraySubsetLevel(\"street_address_list\"),\n",
+    "    ],\n",
+    "    output_column_name=\"street_address\",\n",
+    "    comparison_description=None\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 122,
    "id": "f72f546c-c338-4c9f-aab1-ba03c3169e18",
    "metadata": {},
    "outputs": [
@@ -1707,22 +1324,12 @@
     }
    ],
    "source": [
-    "company_name_comparison = cl.NameComparison(\n",
-    "    \"company_name_no_legal\",\n",
-    "    jaro_winkler_thresholds=[.95],\n",
-    ")\n",
-    "\"\"\"\n",
-    "company_name_comparison = cl.JaccardAtThresholds(\n",
-    "     \"company_name\",\n",
-    "    # dmeta_col_name=\"company_name_mphone_list\" # this was breaking it for some reason\n",
-    ")\n",
-    "\"\"\"\n",
     "print(company_name_comparison.get_comparison(\"duckdb\").human_readable_description)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 449,
+   "execution_count": 123,
    "id": "4298a288-c306-4d75-9d72-e5b8f87774ce",
    "metadata": {},
    "outputs": [
@@ -1741,58 +1348,36 @@
     }
    ],
    "source": [
-    "address_comparison = cl.LevenshteinAtThresholds(\n",
-    "    \"street_address\",\n",
-    "    distance_threshold_or_thresholds=[1]\n",
-    ").configure(term_frequency_adjustments=True)\n",
     "print(address_comparison.get_comparison(\"duckdb\").human_readable_description)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 422,
-   "id": "d2e043ed-7f64-4547-992d-7f947a63db6d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# NOT USED\n",
-    "address_comparison = cl.CustomComparison(\n",
-    "    comparison_levels = [\n",
-    "        cll.NullLevel(\"street_address\"),\n",
-    "        cll.ExactMatchLevel(\"street_address\"),\n",
-    "        cll.LevenshteinLevel(\"street_address\", distance_threshold=1),\n",
-    "        cll.ArraySubsetLevel(\"street_address_list\"),\n",
-    "    ],\n",
-    "    output_column_name=\"street_address\",\n",
-    "    comparison_description=None\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 388,
-   "id": "63ed7cd2-d803-4d17-b730-c9fc17df0607",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Use state and city instead of zip code\n",
-    "zip_code_comparison = cl.ExactMatch(\"zip_code\").configure(term_frequency_adjustments=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 450,
-   "id": "974a3982-38a1-45cb-9875-b8d4584c808d",
+   "execution_count": 124,
+   "id": "afdd5872-bc29-406f-bd0a-d5f4436f6794",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Comparison 'ExactMatch' of \"state\".\n",
+      "Similarity is assessed using the following ComparisonLevels:\n",
+      "    - 'state is NULL' with SQL rule: \"state_l\" IS NULL OR \"state_r\" IS NULL\n",
+      "    - 'Exact match on state' with SQL rule: \"state_l\" = \"state_r\"\n",
+      "    - 'All other comparisons' with SQL rule: ELSE\n",
+      "\n"
+     ]
+    }
+   ],
    "source": [
-    "state_comparison = cl.ExactMatch(\"state\").configure(term_frequency_adjustments=True)"
+    "print(state_comparison.get_comparison(\"duckdb\").human_readable_description)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 451,
-   "id": "7592619b-340a-4496-8195-9ce932cae699",
+   "execution_count": 125,
+   "id": "90596d17-edb4-4ed1-9306-ea6c33ad00c6",
    "metadata": {},
    "outputs": [
     {
@@ -1810,16 +1395,12 @@
     }
    ],
    "source": [
-    "city_comparison = cl.NameComparison(\n",
-    "    \"city\",\n",
-    "    jaro_winkler_thresholds=[0.9]\n",
-    ")\n",
     "print(city_comparison.get_comparison(\"duckdb\").human_readable_description)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 452,
+   "execution_count": 126,
    "id": "a946c820-9b93-41f1-9fc2-6da47d0cf407",
    "metadata": {},
    "outputs": [],
@@ -1830,30 +1411,19 @@
     "    comparisons=[\n",
     "        company_name_comparison,\n",
     "        address_comparison,\n",
-    "        # zip_code_comparison,\n",
     "        state_comparison,\n",
     "        city_comparison\n",
     "    ],\n",
-    "    blocking_rules_to_generate_predictions=[\n",
-    "        BLOCKING_RULES\n",
-    "    ],\n",
+    "    blocking_rules_to_generate_predictions=BLOCKING_RULES,\n",
     "    retain_intermediate_calculation_columns=True,\n",
     ")\n",
     "\n",
     "linker = Linker([sec_match_df, eia_match_df], settings, db_api=DuckDBAPI())"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "04fda31f-fcea-446e-813a-08617d7a43bf",
-   "metadata": {},
-   "source": [
-    "TODO: import deterministic rules"
-   ]
-  },
   {
    "cell_type": "code",
-   "execution_count": 453,
+   "execution_count": 127,
    "id": "36cae876-783d-4bff-89df-9d30cc5e60d6",
    "metadata": {},
    "outputs": [
@@ -1861,26 +1431,18 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Probability two random records match is estimated to be  1.78e-06.\n",
-      "This means that amongst all possible pairwise record comparisons, one in 562,858.42 are expected to match.  With 2,542,342,605 total possible comparisons, we expect a total of around 4,516.84 matching pairs\n"
+      "Probability two random records match is estimated to be  2.37e-06.\n",
+      "This means that amongst all possible pairwise record comparisons, one in 421,176.28 are expected to match.  With 1,270,622,346 total possible comparisons, we expect a total of around 3,016.84 matching pairs\n"
      ]
     }
    ],
    "source": [
-    "deterministic_rules = [\n",
-    "    block_on(\"company_name_mphone\", \"company_name_mphone\"),\n",
-    "    # block_on(\"street_address\"),\n",
-    "    \"jaro_winkler_similarity(r.company_name, l.company_name) >= .95 and l.city = r.city\",\n",
-    "    # \"substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4) and l.city = r.city and jaccard(r.street_address, l.street_address) >= .9\",\n",
-    "    \"substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4) and l.city = r.city and l.street_address = r.street_address\",\n",
-    "]\n",
-    "\n",
-    "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.95)"
+    "linker.training.estimate_probability_two_random_records_match(deterministic_blocking_rules, recall=0.95)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 128,
    "id": "a5190bda-070d-4d8e-a6db-be7c65b04db3",
    "metadata": {},
    "outputs": [
@@ -1890,6 +1452,48 @@
      "text": [
       "----- Estimating u probabilities using random sampling -----\n"
      ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f4e8733639644336a9a29f9b599af513",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2fc66d179b9a430795b4ec68a164c22e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Estimated u probabilities using random sampling\n",
+      "\n",
+      "Your model is not yet fully trained. Missing estimates for:\n",
+      "    - company_name_no_legal (no m values are trained).\n",
+      "    - street_address (no m values are trained).\n",
+      "    - state (no m values are trained).\n",
+      "    - city (no m values are trained).\n"
+     ]
     }
    ],
    "source": [
@@ -1898,7 +1502,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 427,
+   "execution_count": 129,
    "id": "f8dd1336-8a5b-49d2-a054-df0c0a0e953f",
    "metadata": {},
    "outputs": [
@@ -1926,12 +1530,13 @@
       "WARNING:\n",
       "Level All other comparisons on comparison company_name_no_legal not observed in dataset, unable to train m value\n",
       "\n",
-      "Iteration 1: Largest change in params was -0.347 in the m_probability of city, level `All other comparisons`\n",
-      "Iteration 2: Largest change in params was 0.307 in the m_probability of city, level `All other comparisons`\n",
-      "Iteration 3: Largest change in params was 0.0403 in the m_probability of city, level `All other comparisons`\n",
-      "Iteration 4: Largest change in params was 4.46e-05 in the m_probability of city, level `All other comparisons`\n",
+      "Iteration 1: Largest change in params was 0.702 in the m_probability of street_address, level `All other comparisons`\n",
+      "Iteration 2: Largest change in params was 0.283 in probability_two_random_records_match\n",
+      "Iteration 3: Largest change in params was 0.282 in probability_two_random_records_match\n",
+      "Iteration 4: Largest change in params was 0.000537 in probability_two_random_records_match\n",
+      "Iteration 5: Largest change in params was 1.09e-07 in probability_two_random_records_match\n",
       "\n",
-      "EM converged after 4 iterations\n",
+      "EM converged after 5 iterations\n",
       "m probability not trained for company_name_no_legal - Jaro-Winkler distance of company_name_no_legal >= 0.95 (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n",
       "m probability not trained for company_name_no_legal - All other comparisons (comparison vector value: 0). This usually means the comparison level was never observed in the training data.\n",
       "\n",
@@ -1949,7 +1554,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 428,
+   "execution_count": 130,
    "id": "9581aa18-3352-429a-86c4-6078bcf13a55",
    "metadata": {},
    "outputs": [
@@ -1971,21 +1576,19 @@
       "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n",
       "    - street_address\n",
       "\n",
-      "Iteration 1: Largest change in params was -0.395 in the m_probability of city, level `All other comparisons`\n",
-      "Iteration 2: Largest change in params was 0.889 in the m_probability of company_name_no_legal, level `All other comparisons`\n",
-      "Iteration 3: Largest change in params was 0.285 in probability_two_random_records_match\n",
-      "Iteration 4: Largest change in params was 0.0152 in probability_two_random_records_match\n",
-      "Iteration 5: Largest change in params was 0.048 in the m_probability of city, level `All other comparisons`\n",
-      "Iteration 6: Largest change in params was 0.0559 in the m_probability of city, level `All other comparisons`\n",
-      "Iteration 7: Largest change in params was 0.0205 in probability_two_random_records_match\n",
-      "Iteration 8: Largest change in params was 0.00696 in probability_two_random_records_match\n",
-      "Iteration 9: Largest change in params was 0.0024 in probability_two_random_records_match\n",
-      "Iteration 10: Largest change in params was 0.000849 in probability_two_random_records_match\n",
-      "Iteration 11: Largest change in params was 0.000305 in probability_two_random_records_match\n",
-      "Iteration 12: Largest change in params was 0.00011 in probability_two_random_records_match\n",
-      "Iteration 13: Largest change in params was 3.98e-05 in probability_two_random_records_match\n",
+      "Iteration 1: Largest change in params was -0.967 in the m_probability of company_name_no_legal, level `Exact match on company_name_no_legal`\n",
+      "Iteration 2: Largest change in params was 0.477 in probability_two_random_records_match\n",
+      "Iteration 3: Largest change in params was 0.0395 in probability_two_random_records_match\n",
+      "Iteration 4: Largest change in params was 0.0443 in the m_probability of city, level `All other comparisons`\n",
+      "Iteration 5: Largest change in params was 0.0195 in probability_two_random_records_match\n",
+      "Iteration 6: Largest change in params was 0.00733 in probability_two_random_records_match\n",
+      "Iteration 7: Largest change in params was 0.00275 in probability_two_random_records_match\n",
+      "Iteration 8: Largest change in params was 0.00105 in probability_two_random_records_match\n",
+      "Iteration 9: Largest change in params was 0.0004 in probability_two_random_records_match\n",
+      "Iteration 10: Largest change in params was 0.000153 in probability_two_random_records_match\n",
+      "Iteration 11: Largest change in params was 5.9e-05 in probability_two_random_records_match\n",
       "\n",
-      "EM converged after 13 iterations\n",
+      "EM converged after 11 iterations\n",
       "\n",
       "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n"
      ]
@@ -2000,8 +1603,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 429,
-   "id": "8ad317ed-1db9-4932-9815-6e9e0efa9580",
+   "execution_count": 131,
+   "id": "61298aa2-dbd4-4f2a-9c25-5f831d226d13",
    "metadata": {},
    "outputs": [
     {
@@ -2009,23 +1612,23 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-b091c03ea16e42ce928edfa6f14bcf09.vega-embed {\n",
+       "  #altair-viz-860ba1d851fa4b559933e2ae8a6d5f81.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-b091c03ea16e42ce928edfa6f14bcf09.vega-embed details,\n",
-       "  #altair-viz-b091c03ea16e42ce928edfa6f14bcf09.vega-embed details summary {\n",
+       "  #altair-viz-860ba1d851fa4b559933e2ae8a6d5f81.vega-embed details,\n",
+       "  #altair-viz-860ba1d851fa4b559933e2ae8a6d5f81.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-b091c03ea16e42ce928edfa6f14bcf09\"></div>\n",
+       "<div id=\"altair-viz-860ba1d851fa4b559933e2ae8a6d5f81\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-b091c03ea16e42ce928edfa6f14bcf09\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-b091c03ea16e42ce928edfa6f14bcf09\");\n",
+       "    if (outputDiv.id !== \"altair-viz-860ba1d851fa4b559933e2ae8a6d5f81\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-860ba1d851fa4b559933e2ae8a6d5f81\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -2071,14 +1674,14 @@
        "        .catch(showError)\n",
        "        .then(() => displayChart(vegaEmbed));\n",
        "    }\n",
-       "  })({\"config\": {\"view\": {\"continuousWidth\": 300, \"continuousHeight\": 300, \"discreteHeight\": 60, \"discreteWidth\": 400}, \"header\": {\"title\": null}, \"mark\": {\"tooltip\": null}, \"title\": {\"anchor\": \"middle\"}}, \"vconcat\": [{\"mark\": {\"type\": \"bar\", \"clip\": true, \"height\": 15}, \"encoding\": {\"color\": {\"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-10, 0, 10], \"interpolate\": \"lab\", \"range\": [\"red\", \"#bbbbbb\", \"green\"]}, \"title\": \"Match weight\", \"type\": \"quantitative\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"probability_two_random_records_match\", \"format\": \".4f\", \"title\": \"Probability two random records match\", \"type\": \"nominal\"}, {\"field\": \"log2_bayes_factor\", \"format\": \",.4f\", \"title\": \"Equivalent match weight\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"domain\": false, \"gridColor\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": \"#aaa\"}, \"value\": \"#ddd\"}, \"gridDash\": {\"condition\": {\"test\": \"abs(datum.value / 10) == 1\", \"value\": [3]}, \"value\": null}, \"gridWidth\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": 2}, \"value\": 1}, \"labels\": false, \"ticks\": false, \"title\": \"\"}, \"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-20, 20]}, \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": \"Prior (starting) match weight\", \"titleAlign\": \"right\", \"titleAngle\": 0, \"titleFontWeight\": \"normal\"}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": 20, \"transform\": [{\"filter\": \"(datum.comparison_name == 
'probability_two_random_records_match')\"}]}, {\"mark\": {\"type\": \"bar\", \"clip\": true}, \"encoding\": {\"color\": {\"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-10, 0, 10], \"interpolate\": \"lab\", \"range\": [\"red\", \"#bbbbbb\", \"green\"]}, \"title\": \"Match weight\", \"type\": \"quantitative\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labelAlign\": \"left\", \"labelAnchor\": \"middle\", \"labelAngle\": 0}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"gridColor\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": \"#aaa\"}, \"value\": \"#ddd\"}, \"gridDash\": {\"condition\": {\"test\": \"abs(datum.value / 10) == 1\", \"value\": [3]}, \"value\": null}, \"gridWidth\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": 2}, \"value\": 1}, 
\"title\": \"Comparison level match weight = log2(m/u)\"}, \"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-20, 20]}, \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"axis\": {\"y\": \"independent\"}, \"scale\": {\"y\": \"independent\"}}, \"transform\": [{\"filter\": \"(datum.comparison_name != 'probability_two_random_records_match')\"}]}], \"data\": {\"name\": \"data-486b54dc4323abf2383382ed2927fd87\"}, \"params\": [{\"name\": \"mouse_zoom\", \"select\": {\"type\": \"interval\", \"encodings\": [\"x\"]}, \"bind\": \"scales\", \"views\": []}], \"resolve\": {\"axis\": {\"y\": \"independent\"}, \"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Model parameters (components of final match weight)\", \"subtitle\": \"Use mousewheel to zoom\"}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-486b54dc4323abf2383382ed2927fd87\": [{\"comparison_name\": \"probability_two_random_records_match\", \"sql_condition\": null, \"label_for_charts\": \"\", \"m_probability\": null, \"u_probability\": null, \"m_probability_description\": null, \"u_probability_description\": null, \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": null, \"is_null_level\": false, \"bayes_factor\": 1.7766488754200009e-06, \"log2_bayes_factor\": -19.10240998404316, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 0, \"bayes_factor_description\": \"The probability that two random records drawn at random match is 0.000 or one in  562,858.4 records.This is equivalent to a starting match weight of -19.102.\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": -1}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": 
\"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Exact match on company_name_no_legal\", \"m_probability\": 1.0, \"u_probability\": 1.1463345458785815e-06, \"m_probability_description\": \"Amongst matching record comparisons, 100% of records (i.e. one in 1) are in the exact match on company_name_no_legal comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.0001146% of records (i.e. one in 872,346) are in the exact match on company_name_no_legal comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"company_name_no_legal\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 872345.6896551723, \"log2_bayes_factor\": 19.734540428156702, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on company_name_no_legal` then comparison is 872,346 times more likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"jaro_winkler_similarity(\\\"company_name_no_legal_l\\\", \\\"company_name_no_legal_r\\\") >= 0.95\", \"label_for_charts\": \"Jaro-Winkler distance of company_name_no_legal >= 0.95\", \"m_probability\": 0.0022252026790704244, \"u_probability\": 3.6564119135782337e-07, \"m_probability_description\": \"Amongst matching record comparisons, 0.2225% of records (i.e. one in 449) are in the jaro-winkler distance of company_name_no_legal >= 0.95 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 3.656e-05% of records (i.e. 
one in 2,734,922) are in the jaro-winkler distance of company_name_no_legal >= 0.95 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 6085.754919480062, \"log2_bayes_factor\": 12.571220520655558, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of company_name_no_legal >= 0.95` then comparison is 6,086 times more likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.9859795613823598, \"u_probability\": 0.9999984880242627, \"m_probability_description\": \"Amongst matching record comparisons, 98.6% of records (i.e. one in 1.014) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 100% of records (i.e. 
one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.985981052161788, \"log2_bayes_factor\": -0.0203681726400408, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.014 times less likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"\\\"street_address_l\\\" = \\\"street_address_r\\\"\", \"label_for_charts\": \"Exact match on street_address\", \"m_probability\": 0.7549019576781999, \"u_probability\": 0.13960280373831777, \"m_probability_description\": \"Amongst matching record comparisons, 75.49% of records (i.e. one in 1.325) are in the exact match on street_address comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 13.96% of records (i.e. 
one in 7.163) are in the exact match on street_address comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 5.407498542029615, \"log2_bayes_factor\": 2.434961371207702, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on street_address` then comparison is 5.407 times more likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"levenshtein(\\\"street_address_l\\\", \\\"street_address_r\\\") <= 1\", \"label_for_charts\": \"Levenshtein distance of street_address <= 1\", \"m_probability\": 0.07843139336913232, \"u_probability\": 0.2336448598130841, \"m_probability_description\": \"Amongst matching record comparisons, 7.843% of records (i.e. one in 12.75) are in the levenshtein distance of street_address <= 1 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 23.36% of records (i.e. 
one in 4.28) are in the levenshtein distance of street_address <= 1 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.33568636361988635, \"log2_bayes_factor\": -1.5748141623724388, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `levenshtein distance of street_address <= 1` then comparison is 2.979 times less likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"LEAST(ARRAY_LENGTH(\\\"street_address_list_l\\\"), ARRAY_LENGTH(\\\"street_address_list_r\\\")) <> 0 AND ARRAY_LENGTH(ARRAY_INTERSECT(\\\"street_address_list_l\\\", \\\"street_address_list_r\\\")) = LEAST(ARRAY_LENGTH(\\\"street_address_list_l\\\"), ARRAY_LENGTH(\\\"street_address_list_r\\\"))\", \"label_for_charts\": \"Array subset\", \"m_probability\": 0.16666664895266775, \"u_probability\": 0.6267523364485982, \"m_probability_description\": \"Amongst matching record comparisons, 16.67% of records (i.e. one in 6) are in the array subset comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 62.68% of records (i.e. 
one in 1.596) are in the array subset comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.2659210652441446, \"log2_bayes_factor\": -1.9109300284119297, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `array subset` then comparison is 3.761 times less likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"state\", \"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Exact match on state\", \"m_probability\": 0.7040254501446316, \"u_probability\": 0.047388885407471534, \"m_probability_description\": \"Amongst matching record comparisons, 70.4% of records (i.e. one in 1.42) are in the exact match on state comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 4.739% of records (i.e. one in 21.1) are in the exact match on state comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"state\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 14.856341188257446, \"log2_bayes_factor\": 3.8930069483478316, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on state` then comparison is 14.86 times more likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 2}, {\"comparison_name\": \"state\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.2959745498553683, \"u_probability\": 0.9526111145925285, \"m_probability_description\": \"Amongst matching record comparisons, 29.6% of records (i.e. 
one in 3.379) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 95.26% of records (i.e. one in 1.05) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.3106981908162692, \"log2_bayes_factor\": -1.6864142541614218, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 3.219 times less likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 2}, {\"comparison_name\": \"city\", \"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Exact match on city\", \"m_probability\": 0.5770511573326839, \"u_probability\": 0.0060865630580326634, \"m_probability_description\": \"Amongst matching record comparisons, 57.71% of records (i.e. one in 1.733) are in the exact match on city comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.6087% of records (i.e. 
one in 164) are in the exact match on city comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"city\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 94.80738995567096, \"log2_bayes_factor\": 6.566927612205768, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on city` then comparison is 94.81 times more likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 3}, {\"comparison_name\": \"city\", \"sql_condition\": \"jaro_winkler_similarity(\\\"city_l\\\", \\\"city_r\\\") >= 0.9\", \"label_for_charts\": \"Jaro-Winkler distance of city >= 0.9\", \"m_probability\": 0.027155350274657646, \"u_probability\": 0.0004625318884682651, \"m_probability_description\": \"Amongst matching record comparisons, 2.716% of records (i.e. one in 36.83) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.04625% of records (i.e. one in 2,162) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 58.710222909356894, \"log2_bayes_factor\": 5.875539829168419, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of city >= 0.9` then comparison is 58.71 times more likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 3}, {\"comparison_name\": \"city\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.39579349044407397, \"u_probability\": 0.9934509050534991, \"m_probability_description\": \"Amongst matching record comparisons, 39.58% of records (i.e. 
one in 2.527) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 99.35% of records (i.e. one in 1.007) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.39840266733941904, \"log2_bayes_factor\": -1.327700788484204, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 2.51 times less likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 3}]}}, {\"mode\": \"vega-lite\"});\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 300, \"continuousHeight\": 300, \"discreteHeight\": 60, \"discreteWidth\": 400}, \"header\": {\"title\": null}, \"mark\": {\"tooltip\": null}, \"title\": {\"anchor\": \"middle\"}}, \"vconcat\": [{\"mark\": {\"type\": \"bar\", \"clip\": true, \"height\": 15}, \"encoding\": {\"color\": {\"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-10, 0, 10], \"interpolate\": \"lab\", \"range\": [\"red\", \"#bbbbbb\", \"green\"]}, \"title\": \"Match weight\", \"type\": \"quantitative\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"probability_two_random_records_match\", \"format\": \".4f\", \"title\": \"Probability two random records match\", \"type\": \"nominal\"}, {\"field\": \"log2_bayes_factor\", \"format\": \",.4f\", \"title\": \"Equivalent match weight\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"domain\": false, \"gridColor\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": \"#aaa\"}, \"value\": \"#ddd\"}, \"gridDash\": {\"condition\": {\"test\": \"abs(datum.value / 10) == 1\", \"value\": [3]}, \"value\": null}, \"gridWidth\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": 2}, \"value\": 1}, \"labels\": false, \"ticks\": false, \"title\": \"\"}, \"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-20, 20]}, \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": \"Prior (starting) match weight\", \"titleAlign\": \"right\", \"titleAngle\": 0, \"titleFontWeight\": \"normal\"}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": 20, \"transform\": [{\"filter\": \"(datum.comparison_name == 
'probability_two_random_records_match')\"}]}, {\"mark\": {\"type\": \"bar\", \"clip\": true}, \"encoding\": {\"color\": {\"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-10, 0, 10], \"interpolate\": \"lab\", \"range\": [\"red\", \"#bbbbbb\", \"green\"]}, \"title\": \"Match weight\", \"type\": \"quantitative\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labelAlign\": \"left\", \"labelAnchor\": \"middle\", \"labelAngle\": 0}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"gridColor\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": \"#aaa\"}, \"value\": \"#ddd\"}, \"gridDash\": {\"condition\": {\"test\": \"abs(datum.value / 10) == 1\", \"value\": [3]}, \"value\": null}, \"gridWidth\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": 2}, \"value\": 1}, 
\"title\": \"Comparison level match weight = log2(m/u)\"}, \"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-20, 20]}, \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"axis\": {\"y\": \"independent\"}, \"scale\": {\"y\": \"independent\"}}, \"transform\": [{\"filter\": \"(datum.comparison_name != 'probability_two_random_records_match')\"}]}], \"data\": {\"name\": \"data-64b98266126531a5fb88840b22d4f48f\"}, \"params\": [{\"name\": \"mouse_zoom\", \"select\": {\"type\": \"interval\", \"encodings\": [\"x\"]}, \"bind\": \"scales\", \"views\": []}], \"resolve\": {\"axis\": {\"y\": \"independent\"}, \"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Model parameters (components of final match weight)\", \"subtitle\": \"Use mousewheel to zoom\"}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-64b98266126531a5fb88840b22d4f48f\": [{\"comparison_name\": \"probability_two_random_records_match\", \"sql_condition\": null, \"label_for_charts\": \"\", \"m_probability\": null, \"u_probability\": null, \"m_probability_description\": null, \"u_probability_description\": null, \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": null, \"is_null_level\": false, \"bayes_factor\": 2.3743083676072958e-06, \"log2_bayes_factor\": -18.684061249539493, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 0, \"bayes_factor_description\": \"The probability that two random records drawn at random match is 0.000 or one in  421,176.3 records.This is equivalent to a starting match weight of -18.684.\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": -1}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": 
\"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Exact match on company_name_no_legal\", \"m_probability\": 0.5058151845954096, \"u_probability\": 7.755771009548427e-07, \"m_probability_description\": \"Amongst matching record comparisons, 50.58% of records (i.e. one in 1.977) are in the exact match on company_name_no_legal comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 7.756e-05% of records (i.e. one in 1,289,362) are in the exact match on company_name_no_legal comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"company_name_no_legal\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 652179.1114934688, \"log2_bayes_factor\": 19.314908708489483, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on company_name_no_legal` then comparison is 652,179 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"jaro_winkler_similarity(\\\"company_name_no_legal_l\\\", \\\"company_name_no_legal_r\\\") >= 0.95\", \"label_for_charts\": \"Jaro-Winkler distance of company_name_no_legal >= 0.95\", \"m_probability\": 0.002325464819968982, \"u_probability\": 4.0767514280959686e-07, \"m_probability_description\": \"Amongst matching record comparisons, 0.2325% of records (i.e. one in 430) are in the jaro-winkler distance of company_name_no_legal >= 0.95 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 4.077e-05% of records (i.e. 
one in 2,452,933) are in the jaro-winkler distance of company_name_no_legal >= 0.95 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 5704.210474895404, \"log2_bayes_factor\": 12.477811500224687, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of company_name_no_legal >= 0.95` then comparison is 5,704 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.9860441659892117, \"u_probability\": 0.9999988167477563, \"m_probability_description\": \"Amongst matching record comparisons, 98.6% of records (i.e. one in 1.014) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 100% of records (i.e. 
one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.9860453327295641, \"log2_bayes_factor\": -0.020274119885879474, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.014 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"\\\"street_address_l\\\" = \\\"street_address_r\\\"\", \"label_for_charts\": \"Exact match on street_address\", \"m_probability\": 0.10984848485092523, \"u_probability\": 1.1623713218156555e-05, \"m_probability_description\": \"Amongst matching record comparisons, 10.98% of records (i.e. one in 9.103) are in the exact match on street_address comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.001162% of records (i.e. 
one in 86,031) are in the exact match on street_address comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"street_address\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 9450.378101150924, \"log2_bayes_factor\": 13.20615633608501, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on street_address` then comparison is 9,450 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"levenshtein(\\\"street_address_l\\\", \\\"street_address_r\\\") <= 1\", \"label_for_charts\": \"Levenshtein distance of street_address <= 1\", \"m_probability\": 0.008522727272916612, \"u_probability\": 2.0015855182334595e-05, \"m_probability_description\": \"Amongst matching record comparisons, 0.8523% of records (i.e. one in 117) are in the levenshtein distance of street_address <= 1 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.002002% of records (i.e. 
one in 49,960) are in the levenshtein distance of street_address <= 1 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 425.7988077590869, \"log2_bayes_factor\": 8.734028100010068, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `levenshtein distance of street_address <= 1` then comparison is 426 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.8816287878761582, \"u_probability\": 0.9999683604315995, \"m_probability_description\": \"Amongst matching record comparisons, 88.16% of records (i.e. one in 1.134) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 100% of records (i.e. one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.8816566831130892, \"log2_bayes_factor\": -0.18171111483340682, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.134 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"state\", \"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Exact match on state\", \"m_probability\": 0.811446456778291, \"u_probability\": 0.05111863613991284, \"m_probability_description\": \"Amongst matching record comparisons, 81.14% of records (i.e. 
one in 1.232) are in the exact match on state comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 5.112% of records (i.e. one in 19.56) are in the exact match on state comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"state\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 15.873789248941307, \"log2_bayes_factor\": 3.9885746514233986, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on state` then comparison is 15.87 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 2}, {\"comparison_name\": \"state\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.188553543221709, \"u_probability\": 0.9488813638600871, \"m_probability_description\": \"Amongst matching record comparisons, 18.86% of records (i.e. one in 5.304) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 94.89% of records (i.e. 
one in 1.054) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.19871139891995104, \"log2_bayes_factor\": -2.3312534608825977, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 5.032 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 2}, {\"comparison_name\": \"city\", \"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Exact match on city\", \"m_probability\": 0.6839791977766309, \"u_probability\": 0.006331298335827164, \"m_probability_description\": \"Amongst matching record comparisons, 68.4% of records (i.e. one in 1.462) are in the exact match on city comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.6331% of records (i.e. one in 158) are in the exact match on city comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"city\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 108.03142759932369, \"log2_bayes_factor\": 6.755307259996993, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on city` then comparison is 108 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 3}, {\"comparison_name\": \"city\", \"sql_condition\": \"jaro_winkler_similarity(\\\"city_l\\\", \\\"city_r\\\") >= 0.9\", \"label_for_charts\": \"Jaro-Winkler distance of city >= 0.9\", \"m_probability\": 0.021461003233482275, \"u_probability\": 0.0005131439788678606, \"m_probability_description\": \"Amongst matching record comparisons, 2.146% of records (i.e. 
one in 46.6) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.05131% of records (i.e. one in 1,949) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 41.822576347541414, \"log2_bayes_factor\": 5.386210032217432, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of city >= 0.9` then comparison is 41.82 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 3}, {\"comparison_name\": \"city\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.29455979898988677, \"u_probability\": 0.993155557685305, \"m_probability_description\": \"Amongst matching record comparisons, 29.46% of records (i.e. one in 3.395) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 99.32% of records (i.e. one in 1.007) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.29658979070348424, \"log2_bayes_factor\": -1.7534591570912872, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 3.372 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 3}]}}, {\"mode\": \"vega-lite\"});\n",
        "</script>"
       ],
       "text/plain": [
        "alt.VConcatChart(...)"
       ]
      },
-     "execution_count": 429,
+     "execution_count": 131,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2089,8 +1692,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 430,
-   "id": "5e21bf55-64ac-4f4b-8f1c-d7507b5e7af6",
+   "execution_count": 132,
+   "id": "f365f59e-e4d0-44f3-a1fb-62e0d63d7ba3",
    "metadata": {},
    "outputs": [
     {
@@ -2098,23 +1701,23 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-502ff82b439845389349212cfd7a7eb0.vega-embed {\n",
+       "  #altair-viz-6202cf8c985a4c3cb08581ec3f06c2bd.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-502ff82b439845389349212cfd7a7eb0.vega-embed details,\n",
-       "  #altair-viz-502ff82b439845389349212cfd7a7eb0.vega-embed details summary {\n",
+       "  #altair-viz-6202cf8c985a4c3cb08581ec3f06c2bd.vega-embed details,\n",
+       "  #altair-viz-6202cf8c985a4c3cb08581ec3f06c2bd.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-502ff82b439845389349212cfd7a7eb0\"></div>\n",
+       "<div id=\"altair-viz-6202cf8c985a4c3cb08581ec3f06c2bd\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-502ff82b439845389349212cfd7a7eb0\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-502ff82b439845389349212cfd7a7eb0\");\n",
+       "    if (outputDiv.id !== \"altair-viz-6202cf8c985a4c3cb08581ec3f06c2bd\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-6202cf8c985a4c3cb08581ec3f06c2bd\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -2160,14 +1763,14 @@
        "        .catch(showError)\n",
        "        .then(() => displayChart(vegaEmbed));\n",
        "    }\n",
-       "  })({\"config\": {\"view\": {\"continuousWidth\": 300, \"continuousHeight\": 300, \"discreteHeight\": 300, \"discreteWidth\": 400}, \"header\": {\"title\": null}, \"title\": {\"anchor\": \"middle\", \"offset\": 10}}, \"hconcat\": [{\"mark\": \"bar\", \"encoding\": {\"color\": {\"value\": \"green\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labelAlign\": \"left\", \"labelAnchor\": \"middle\", \"labelAngle\": 0}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"title\": \"Proportion of record comparisons\"}, \"field\": \"m_probability\", \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Amongst 
matching record comparisons:\", \"fontSize\": 12, \"fontWeight\": \"bold\"}, \"transform\": [{\"filter\": \"(datum.bayes_factor != 'no-op filter due to vega lite issue 4680')\"}], \"width\": 150}, {\"mark\": \"bar\", \"encoding\": {\"color\": {\"value\": \"red\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labels\": false}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"title\": \"Proportion of record comparisons\"}, \"field\": \"u_probability\", \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Amongst non-matching record comparisons:\", \"fontSize\": 12, \"fontWeight\": \"bold\"}, \"transform\": 
[{\"filter\": \"(datum.bayes_factor != 'no-op filter2 due to vega lite issue 4680')\"}], \"width\": 150}], \"data\": {\"name\": \"data-98b48f38ee96425504d9a9c7a3e99480\"}, \"title\": {\"text\": \"Proportion of record comparisons in each comparison level by match status\", \"subtitle\": \"(m and u probabilities)\"}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-98b48f38ee96425504d9a9c7a3e99480\": [{\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Exact match on company_name_no_legal\", \"m_probability\": 1.0, \"u_probability\": 1.1463345458785815e-06, \"m_probability_description\": \"Amongst matching record comparisons, 100% of records (i.e. one in 1) are in the exact match on company_name_no_legal comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.0001146% of records (i.e. one in 872,346) are in the exact match on company_name_no_legal comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"company_name_no_legal\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 872345.6896551723, \"log2_bayes_factor\": 19.734540428156702, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on company_name_no_legal` then comparison is 872,346 times more likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"jaro_winkler_similarity(\\\"company_name_no_legal_l\\\", \\\"company_name_no_legal_r\\\") >= 0.95\", \"label_for_charts\": \"Jaro-Winkler distance of company_name_no_legal >= 0.95\", \"m_probability\": 0.0022252026790704244, \"u_probability\": 3.6564119135782337e-07, \"m_probability_description\": \"Amongst 
matching record comparisons, 0.2225% of records (i.e. one in 449) are in the jaro-winkler distance of company_name_no_legal >= 0.95 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 3.656e-05% of records (i.e. one in 2,734,922) are in the jaro-winkler distance of company_name_no_legal >= 0.95 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 6085.754919480062, \"log2_bayes_factor\": 12.571220520655558, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of company_name_no_legal >= 0.95` then comparison is 6,086 times more likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.9859795613823598, \"u_probability\": 0.9999984880242627, \"m_probability_description\": \"Amongst matching record comparisons, 98.6% of records (i.e. one in 1.014) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 100% of records (i.e. 
one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.985981052161788, \"log2_bayes_factor\": -0.0203681726400408, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.014 times less likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"\\\"street_address_l\\\" = \\\"street_address_r\\\"\", \"label_for_charts\": \"Exact match on street_address\", \"m_probability\": 0.7549019576781999, \"u_probability\": 0.13960280373831777, \"m_probability_description\": \"Amongst matching record comparisons, 75.49% of records (i.e. one in 1.325) are in the exact match on street_address comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 13.96% of records (i.e. 
one in 7.163) are in the exact match on street_address comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 5.407498542029615, \"log2_bayes_factor\": 2.434961371207702, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on street_address` then comparison is 5.407 times more likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"levenshtein(\\\"street_address_l\\\", \\\"street_address_r\\\") <= 1\", \"label_for_charts\": \"Levenshtein distance of street_address <= 1\", \"m_probability\": 0.07843139336913232, \"u_probability\": 0.2336448598130841, \"m_probability_description\": \"Amongst matching record comparisons, 7.843% of records (i.e. one in 12.75) are in the levenshtein distance of street_address <= 1 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 23.36% of records (i.e. 
one in 4.28) are in the levenshtein distance of street_address <= 1 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.33568636361988635, \"log2_bayes_factor\": -1.5748141623724388, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `levenshtein distance of street_address <= 1` then comparison is 2.979 times less likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"LEAST(ARRAY_LENGTH(\\\"street_address_list_l\\\"), ARRAY_LENGTH(\\\"street_address_list_r\\\")) <> 0 AND ARRAY_LENGTH(ARRAY_INTERSECT(\\\"street_address_list_l\\\", \\\"street_address_list_r\\\")) = LEAST(ARRAY_LENGTH(\\\"street_address_list_l\\\"), ARRAY_LENGTH(\\\"street_address_list_r\\\"))\", \"label_for_charts\": \"Array subset\", \"m_probability\": 0.16666664895266775, \"u_probability\": 0.6267523364485982, \"m_probability_description\": \"Amongst matching record comparisons, 16.67% of records (i.e. one in 6) are in the array subset comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 62.68% of records (i.e. 
one in 1.596) are in the array subset comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.2659210652441446, \"log2_bayes_factor\": -1.9109300284119297, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `array subset` then comparison is 3.761 times less likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"state\", \"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Exact match on state\", \"m_probability\": 0.7040254501446316, \"u_probability\": 0.047388885407471534, \"m_probability_description\": \"Amongst matching record comparisons, 70.4% of records (i.e. one in 1.42) are in the exact match on state comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 4.739% of records (i.e. one in 21.1) are in the exact match on state comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"state\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 14.856341188257446, \"log2_bayes_factor\": 3.8930069483478316, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on state` then comparison is 14.86 times more likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 2}, {\"comparison_name\": \"state\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.2959745498553683, \"u_probability\": 0.9526111145925285, \"m_probability_description\": \"Amongst matching record comparisons, 29.6% of records (i.e. 
one in 3.379) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 95.26% of records (i.e. one in 1.05) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.3106981908162692, \"log2_bayes_factor\": -1.6864142541614218, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 3.219 times less likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 2}, {\"comparison_name\": \"city\", \"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Exact match on city\", \"m_probability\": 0.5770511573326839, \"u_probability\": 0.0060865630580326634, \"m_probability_description\": \"Amongst matching record comparisons, 57.71% of records (i.e. one in 1.733) are in the exact match on city comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.6087% of records (i.e. 
one in 164) are in the exact match on city comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"city\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 94.80738995567096, \"log2_bayes_factor\": 6.566927612205768, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on city` then comparison is 94.81 times more likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 3}, {\"comparison_name\": \"city\", \"sql_condition\": \"jaro_winkler_similarity(\\\"city_l\\\", \\\"city_r\\\") >= 0.9\", \"label_for_charts\": \"Jaro-Winkler distance of city >= 0.9\", \"m_probability\": 0.027155350274657646, \"u_probability\": 0.0004625318884682651, \"m_probability_description\": \"Amongst matching record comparisons, 2.716% of records (i.e. one in 36.83) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.04625% of records (i.e. one in 2,162) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 58.710222909356894, \"log2_bayes_factor\": 5.875539829168419, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of city >= 0.9` then comparison is 58.71 times more likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 3}, {\"comparison_name\": \"city\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.39579349044407397, \"u_probability\": 0.9934509050534991, \"m_probability_description\": \"Amongst matching record comparisons, 39.58% of records (i.e. 
one in 2.527) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 99.35% of records (i.e. one in 1.007) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.39840266733941904, \"log2_bayes_factor\": -1.327700788484204, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 2.51 times less likely to be a match\", \"probability_two_random_records_match\": 1.7766457189443821e-06, \"comparison_sort_order\": 3}]}}, {\"mode\": \"vega-lite\"});\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 300, \"continuousHeight\": 300, \"discreteHeight\": 300, \"discreteWidth\": 400}, \"header\": {\"title\": null}, \"title\": {\"anchor\": \"middle\", \"offset\": 10}}, \"hconcat\": [{\"mark\": \"bar\", \"encoding\": {\"color\": {\"value\": \"green\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labelAlign\": \"left\", \"labelAnchor\": \"middle\", \"labelAngle\": 0}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"title\": \"Proportion of record comparisons\"}, \"field\": \"m_probability\", \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Amongst 
matching record comparisons:\", \"fontSize\": 12, \"fontWeight\": \"bold\"}, \"transform\": [{\"filter\": \"(datum.bayes_factor != 'no-op filter due to vega lite issue 4680')\"}], \"width\": 150}, {\"mark\": \"bar\", \"encoding\": {\"color\": {\"value\": \"red\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labels\": false}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"title\": \"Proportion of record comparisons\"}, \"field\": \"u_probability\", \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Amongst non-matching record comparisons:\", \"fontSize\": 12, \"fontWeight\": \"bold\"}, \"transform\": 
[{\"filter\": \"(datum.bayes_factor != 'no-op filter2 due to vega lite issue 4680')\"}], \"width\": 150}], \"data\": {\"name\": \"data-dc74bccc7251002cb1499c8a0408d184\"}, \"title\": {\"text\": \"Proportion of record comparisons in each comparison level by match status\", \"subtitle\": \"(m and u probabilities)\"}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-dc74bccc7251002cb1499c8a0408d184\": [{\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Exact match on company_name_no_legal\", \"m_probability\": 0.5058151845954096, \"u_probability\": 7.755771009548427e-07, \"m_probability_description\": \"Amongst matching record comparisons, 50.58% of records (i.e. one in 1.977) are in the exact match on company_name_no_legal comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 7.756e-05% of records (i.e. 
one in 1,289,362) are in the exact match on company_name_no_legal comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"company_name_no_legal\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 652179.1114934688, \"log2_bayes_factor\": 19.314908708489483, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on company_name_no_legal` then comparison is 652,179 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"jaro_winkler_similarity(\\\"company_name_no_legal_l\\\", \\\"company_name_no_legal_r\\\") >= 0.95\", \"label_for_charts\": \"Jaro-Winkler distance of company_name_no_legal >= 0.95\", \"m_probability\": 0.002325464819968982, \"u_probability\": 4.0767514280959686e-07, \"m_probability_description\": \"Amongst matching record comparisons, 0.2325% of records (i.e. one in 430) are in the jaro-winkler distance of company_name_no_legal >= 0.95 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 4.077e-05% of records (i.e. 
one in 2,452,933) are in the jaro-winkler distance of company_name_no_legal >= 0.95 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 5704.210474895404, \"log2_bayes_factor\": 12.477811500224687, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of company_name_no_legal >= 0.95` then comparison is 5,704 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.9860441659892117, \"u_probability\": 0.9999988167477563, \"m_probability_description\": \"Amongst matching record comparisons, 98.6% of records (i.e. one in 1.014) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 100% of records (i.e. 
one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.9860453327295641, \"log2_bayes_factor\": -0.020274119885879474, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.014 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"\\\"street_address_l\\\" = \\\"street_address_r\\\"\", \"label_for_charts\": \"Exact match on street_address\", \"m_probability\": 0.10984848485092523, \"u_probability\": 1.1623713218156555e-05, \"m_probability_description\": \"Amongst matching record comparisons, 10.98% of records (i.e. one in 9.103) are in the exact match on street_address comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.001162% of records (i.e. 
one in 86,031) are in the exact match on street_address comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"street_address\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 9450.378101150924, \"log2_bayes_factor\": 13.20615633608501, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on street_address` then comparison is 9,450 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"levenshtein(\\\"street_address_l\\\", \\\"street_address_r\\\") <= 1\", \"label_for_charts\": \"Levenshtein distance of street_address <= 1\", \"m_probability\": 0.008522727272916612, \"u_probability\": 2.0015855182334595e-05, \"m_probability_description\": \"Amongst matching record comparisons, 0.8523% of records (i.e. one in 117) are in the levenshtein distance of street_address <= 1 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.002002% of records (i.e. 
one in 49,960) are in the levenshtein distance of street_address <= 1 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 425.7988077590869, \"log2_bayes_factor\": 8.734028100010068, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `levenshtein distance of street_address <= 1` then comparison is 426 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.8816287878761582, \"u_probability\": 0.9999683604315995, \"m_probability_description\": \"Amongst matching record comparisons, 88.16% of records (i.e. one in 1.134) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 100% of records (i.e. one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.8816566831130892, \"log2_bayes_factor\": -0.18171111483340682, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.134 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"state\", \"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Exact match on state\", \"m_probability\": 0.811446456778291, \"u_probability\": 0.05111863613991284, \"m_probability_description\": \"Amongst matching record comparisons, 81.14% of records (i.e. 
one in 1.232) are in the exact match on state comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 5.112% of records (i.e. one in 19.56) are in the exact match on state comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"state\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 15.873789248941307, \"log2_bayes_factor\": 3.9885746514233986, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on state` then comparison is 15.87 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 2}, {\"comparison_name\": \"state\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.188553543221709, \"u_probability\": 0.9488813638600871, \"m_probability_description\": \"Amongst matching record comparisons, 18.86% of records (i.e. one in 5.304) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 94.89% of records (i.e. 
one in 1.054) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.19871139891995104, \"log2_bayes_factor\": -2.3312534608825977, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 5.032 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 2}, {\"comparison_name\": \"city\", \"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Exact match on city\", \"m_probability\": 0.6839791977766309, \"u_probability\": 0.006331298335827164, \"m_probability_description\": \"Amongst matching record comparisons, 68.4% of records (i.e. one in 1.462) are in the exact match on city comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.6331% of records (i.e. one in 158) are in the exact match on city comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"city\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 108.03142759932369, \"log2_bayes_factor\": 6.755307259996993, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on city` then comparison is 108 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 3}, {\"comparison_name\": \"city\", \"sql_condition\": \"jaro_winkler_similarity(\\\"city_l\\\", \\\"city_r\\\") >= 0.9\", \"label_for_charts\": \"Jaro-Winkler distance of city >= 0.9\", \"m_probability\": 0.021461003233482275, \"u_probability\": 0.0005131439788678606, \"m_probability_description\": \"Amongst matching record comparisons, 2.146% of records (i.e. 
one in 46.6) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.05131% of records (i.e. one in 1,949) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 41.822576347541414, \"log2_bayes_factor\": 5.386210032217432, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of city >= 0.9` then comparison is 41.82 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 3}, {\"comparison_name\": \"city\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.29455979898988677, \"u_probability\": 0.993155557685305, \"m_probability_description\": \"Amongst matching record comparisons, 29.46% of records (i.e. one in 3.395) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 99.32% of records (i.e. one in 1.007) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.29658979070348424, \"log2_bayes_factor\": -1.7534591570912872, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 3.372 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 3}]}}, {\"mode\": \"vega-lite\"});\n",
        "</script>"
       ],
       "text/plain": [
        "alt.HConcatChart(...)"
       ]
      },
-     "execution_count": 430,
+     "execution_count": 132,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2198,30 +1801,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 431,
+   "execution_count": 133,
    "id": "94e96441-89b6-4516-aa6a-4d1593ce03be",
    "metadata": {},
    "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "3ce1c0af73694400974ca6253619dd5b",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Blocking time: 9.73 seconds\n",
-      "Predict time: 0.52 seconds\n"
+      "Blocking time: 0.16 seconds\n",
+      "Predict time: 0.31 seconds\n"
      ]
     }
    ],
@@ -2233,7 +1822,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 432,
+   "execution_count": 134,
    "id": "722effbc-24e8-45ca-aa64-8cdcd75846e0",
    "metadata": {},
    "outputs": [],
@@ -2243,7 +1832,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 433,
+   "execution_count": 135,
    "id": "21b19a11-15c6-46ab-bd73-7c4752f7e25e",
    "metadata": {},
    "outputs": [
@@ -2283,10 +1872,11 @@
        "      <th>bf_tf_adj_company_name_no_legal</th>\n",
        "      <th>street_address_l</th>\n",
        "      <th>street_address_r</th>\n",
-       "      <th>street_address_list_l</th>\n",
-       "      <th>street_address_list_r</th>\n",
        "      <th>gamma_street_address</th>\n",
+       "      <th>tf_street_address_l</th>\n",
+       "      <th>tf_street_address_r</th>\n",
        "      <th>bf_street_address</th>\n",
+       "      <th>bf_tf_adj_street_address</th>\n",
        "      <th>state_l</th>\n",
        "      <th>state_r</th>\n",
        "      <th>gamma_state</th>\n",
@@ -2308,199 +1898,204 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>32260</th>\n",
-       "      <td>-24.047823</td>\n",
-       "      <td>5.766122e-08</td>\n",
+       "      <th>295287</th>\n",
+       "      <td>-22.970759</td>\n",
+       "      <td>1.216501e-07</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>82087</td>\n",
-       "      <td>113663</td>\n",
-       "      <td>sutro biopharma</td>\n",
-       "      <td>stirling energy systems solar one</td>\n",
+       "      <td>9829</td>\n",
+       "      <td>3043</td>\n",
+       "      <td>capitol bancorp</td>\n",
+       "      <td>capital power</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.000019</td>\n",
-       "      <td>0.000029</td>\n",
-       "      <td>0.985981</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>310 utah ave., suite 150</td>\n",
-       "      <td>suite 150</td>\n",
-       "      <td>[310, utah, ave.,, suite, 150]</td>\n",
-       "      <td>[suite, 150]</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.265921</td>\n",
-       "      <td>ca</td>\n",
-       "      <td>az</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000012</td>\n",
+       "      <td>0.986045</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>capitol bancorp ctr</td>\n",
+       "      <td>120010423 101 st nw</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.149142</td>\n",
-       "      <td>0.012950</td>\n",
-       "      <td>0.310698</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>south san francisco</td>\n",
-       "      <td>phoenix</td>\n",
+       "      <td>0.000012</td>\n",
+       "      <td>0.000110</td>\n",
+       "      <td>0.881657</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>mi</td>\n",
+       "      <td>ab</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.015147</td>\n",
+       "      <td>0.000197</td>\n",
+       "      <td>0.198711</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>lansing</td>\n",
+       "      <td>edmonton</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000293</td>\n",
+       "      <td>0.000428</td>\n",
+       "      <td>0.296590</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>KPTL BNKRP</td>\n",
+       "      <td>KPTL PWR</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.001438</td>\n",
-       "      <td>0.003511</td>\n",
-       "      <td>0.398403</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>STR BFRM</td>\n",
-       "      <td>STRLNK ENRJ SSTMS SLR ON</td>\n",
-       "      <td>3</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>27875</th>\n",
-       "      <td>-24.047823</td>\n",
-       "      <td>5.766122e-08</td>\n",
+       "      <th>383898</th>\n",
+       "      <td>-22.970759</td>\n",
+       "      <td>1.216501e-07</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>126035</td>\n",
-       "      <td>113797</td>\n",
-       "      <td>corner growth acquisition 2</td>\n",
-       "      <td>grubb and ellis management services</td>\n",
+       "      <td>51783</td>\n",
+       "      <td>17550</td>\n",
+       "      <td>state bancorp</td>\n",
+       "      <td>state street bank and trust</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.000010</td>\n",
-       "      <td>0.000019</td>\n",
-       "      <td>0.985981</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>251 lytton avenue, suite 200</td>\n",
-       "      <td>suite 200</td>\n",
-       "      <td>[251, lytton, avenue,, suite, 200]</td>\n",
-       "      <td>[suite, 200]</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.265921</td>\n",
-       "      <td>ca</td>\n",
-       "      <td>pa</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.986045</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>2 jericho plz</td>\n",
+       "      <td>100 summer st</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.149142</td>\n",
-       "      <td>0.030197</td>\n",
-       "      <td>0.310698</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>palo alto</td>\n",
-       "      <td>pittsburgh</td>\n",
+       "      <td>0.000012</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.881657</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>ny</td>\n",
+       "      <td>ma</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.120228</td>\n",
+       "      <td>0.041765</td>\n",
+       "      <td>0.198711</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>jericho</td>\n",
+       "      <td>boston</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000306</td>\n",
+       "      <td>0.014319</td>\n",
+       "      <td>0.296590</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>STT BNKRP</td>\n",
+       "      <td>STT STRT BNK ANT TRST</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.001850</td>\n",
-       "      <td>0.003656</td>\n",
-       "      <td>0.398403</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>KRNR KR0 AKKSXN</td>\n",
-       "      <td>KRB ANT ELS MNJMNT SRFSS</td>\n",
-       "      <td>3</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>27993</th>\n",
-       "      <td>-24.047823</td>\n",
-       "      <td>5.766122e-08</td>\n",
+       "      <th>383897</th>\n",
+       "      <td>-22.970759</td>\n",
+       "      <td>1.216501e-07</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>125096</td>\n",
-       "      <td>97905</td>\n",
-       "      <td>altus power</td>\n",
-       "      <td>allegheny ridge wind farm</td>\n",
+       "      <td>51782</td>\n",
+       "      <td>17550</td>\n",
+       "      <td>state auto financial</td>\n",
+       "      <td>state street bank and trust</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.000010</td>\n",
-       "      <td>0.000038</td>\n",
-       "      <td>0.985981</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>2200 atlantic street, 6th floor</td>\n",
-       "      <td>6th floor</td>\n",
-       "      <td>[2200, atlantic, street,, 6th, floor]</td>\n",
-       "      <td>[6th, floor]</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.265921</td>\n",
-       "      <td>ct</td>\n",
-       "      <td>ca</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.986045</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>518 east broad st</td>\n",
+       "      <td>100 summer st</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.020325</td>\n",
-       "      <td>0.149142</td>\n",
-       "      <td>0.310698</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>stamford</td>\n",
-       "      <td>san francisco</td>\n",
+       "      <td>0.000012</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.881657</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>oh</td>\n",
+       "      <td>ma</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.016991</td>\n",
+       "      <td>0.041765</td>\n",
+       "      <td>0.198711</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>columbus</td>\n",
+       "      <td>boston</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.002788</td>\n",
+       "      <td>0.014319</td>\n",
+       "      <td>0.296590</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>STT AT FNNXL</td>\n",
+       "      <td>STT STRT BNK ANT TRST</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.003789</td>\n",
-       "      <td>0.013374</td>\n",
-       "      <td>0.398403</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>ALTS PWR</td>\n",
-       "      <td>ALKHN RJ WNT FRM</td>\n",
-       "      <td>3</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>28003</th>\n",
-       "      <td>-24.047823</td>\n",
-       "      <td>5.766122e-08</td>\n",
+       "      <th>383896</th>\n",
+       "      <td>-22.970759</td>\n",
+       "      <td>1.216501e-07</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>115402</td>\n",
-       "      <td>91508</td>\n",
-       "      <td>clearway energy</td>\n",
-       "      <td>clipper windpower</td>\n",
+       "      <td>51781</td>\n",
+       "      <td>17550</td>\n",
+       "      <td>state auto financial</td>\n",
+       "      <td>state street bank and trust</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.000038</td>\n",
-       "      <td>0.000029</td>\n",
-       "      <td>0.985981</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>300 carnegie center, suite 300</td>\n",
-       "      <td>suite 300</td>\n",
-       "      <td>[300, carnegie, center,, suite, 300]</td>\n",
-       "      <td>[suite, 300]</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.265921</td>\n",
-       "      <td>nj</td>\n",
-       "      <td>ca</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.986045</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>518 e broad st</td>\n",
+       "      <td>100 summer st</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.031159</td>\n",
-       "      <td>0.149142</td>\n",
-       "      <td>0.310698</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>princeton</td>\n",
-       "      <td>carpinteria</td>\n",
+       "      <td>0.000012</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.881657</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>oh</td>\n",
+       "      <td>ma</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.016991</td>\n",
+       "      <td>0.041765</td>\n",
+       "      <td>0.198711</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>columbus</td>\n",
+       "      <td>boston</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.002788</td>\n",
+       "      <td>0.014319</td>\n",
+       "      <td>0.296590</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>STT AT FNNXL</td>\n",
+       "      <td>STT STRT BNK ANT TRST</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.002118</td>\n",
-       "      <td>0.000189</td>\n",
-       "      <td>0.398403</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>KLRW ENRJ</td>\n",
-       "      <td>KLPR WNTPWR</td>\n",
-       "      <td>3</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>28024</th>\n",
-       "      <td>-24.047823</td>\n",
-       "      <td>5.766122e-08</td>\n",
+       "      <th>383895</th>\n",
+       "      <td>-22.970759</td>\n",
+       "      <td>1.216501e-07</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>125009</td>\n",
-       "      <td>77758</td>\n",
-       "      <td>benchmark 2020 b21 mortgage trust</td>\n",
-       "      <td>bountiful city city of</td>\n",
+       "      <td>51780</td>\n",
+       "      <td>3805</td>\n",
+       "      <td>starz</td>\n",
+       "      <td>citrus world</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.000010</td>\n",
-       "      <td>0.000048</td>\n",
-       "      <td>0.985981</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>200 west street</td>\n",
-       "      <td>198 south 200 west street</td>\n",
-       "      <td>[200, west, street]</td>\n",
-       "      <td>[198, south, 200, west, street]</td>\n",
-       "      <td>0.0</td>\n",
-       "      <td>0.265921</td>\n",
-       "      <td>ny</td>\n",
-       "      <td>ut</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000049</td>\n",
+       "      <td>0.986045</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>8900 liberty cir</td>\n",
+       "      <td>20205 hwy 2720205 hwy 27</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.113010</td>\n",
-       "      <td>0.010475</td>\n",
-       "      <td>0.310698</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>new york</td>\n",
-       "      <td>bountiful city</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000012</td>\n",
+       "      <td>0.881657</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>co</td>\n",
+       "      <td>fl</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.023802</td>\n",
+       "      <td>0.048477</td>\n",
+       "      <td>0.198711</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>englewood</td>\n",
+       "      <td>lake wales</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.002947</td>\n",
+       "      <td>0.000049</td>\n",
+       "      <td>0.296590</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>STRS</td>\n",
+       "      <td>STRS WRLT</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.086944</td>\n",
-       "      <td>0.000022</td>\n",
-       "      <td>0.398403</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>BNXMRK B MRTKJ TRST</td>\n",
-       "      <td>BNTFL ST ST OF</td>\n",
-       "      <td>3</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>...</th>\n",
@@ -2540,225 +2135,231 @@
        "      <td>...</td>\n",
        "      <td>...</td>\n",
        "      <td>...</td>\n",
+       "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1038434</th>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
+       "      <th>186872</th>\n",
+       "      <td>27.519625</td>\n",
+       "      <td>1.000000e+00</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>137784</td>\n",
-       "      <td>70294</td>\n",
-       "      <td>farmer brothers</td>\n",
-       "      <td>farmers electric ia</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.000029</td>\n",
-       "      <td>0.000038</td>\n",
-       "      <td>0.985981</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>20333 s normandie ave</td>\n",
-       "      <td>1959 yoder ave,sw</td>\n",
-       "      <td>[20333, s, normandie, ave]</td>\n",
-       "      <td>[1959, yoder, ave,sw]</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>ca</td>\n",
-       "      <td>ia</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.149142</td>\n",
-       "      <td>0.016527</td>\n",
-       "      <td>0.310698</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>torrance</td>\n",
-       "      <td>kalona</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.002485</td>\n",
-       "      <td>0.000011</td>\n",
-       "      <td>0.398403</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>FRMR BR0RS</td>\n",
-       "      <td>FRMRS ELKTRK I</td>\n",
+       "      <td>39816</td>\n",
+       "      <td>13109</td>\n",
+       "      <td>northwestern public service</td>\n",
+       "      <td>northwestern public service</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000073</td>\n",
+       "      <td>0.000073</td>\n",
+       "      <td>652179.111493</td>\n",
+       "      <td>0.010580</td>\n",
+       "      <td>33 third st se</td>\n",
+       "      <td>33 third st se</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000037</td>\n",
+       "      <td>0.000037</td>\n",
+       "      <td>9450.378101</td>\n",
+       "      <td>0.317122</td>\n",
+       "      <td>sd</td>\n",
+       "      <td>sd</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.001930</td>\n",
+       "      <td>0.001930</td>\n",
+       "      <td>15.873789</td>\n",
+       "      <td>26.483035</td>\n",
+       "      <td>huron</td>\n",
+       "      <td>huron</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000073</td>\n",
+       "      <td>0.000073</td>\n",
+       "      <td>108.031428</td>\n",
+       "      <td>86.293486</td>\n",
+       "      <td>NR0WSTRN PBLK SRFS</td>\n",
+       "      <td>NR0WSTRN PBLK SRFS</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1038441</th>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
+       "      <th>580681</th>\n",
+       "      <td>27.526533</td>\n",
+       "      <td>1.000000e+00</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>139631</td>\n",
-       "      <td>137540</td>\n",
-       "      <td>international game technology</td>\n",
-       "      <td>intergen north america</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.000048</td>\n",
-       "      <td>0.000029</td>\n",
-       "      <td>0.985981</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>6355 south buffalo drive</td>\n",
-       "      <td>4th floor</td>\n",
-       "      <td>[6355, south, buffalo, drive]</td>\n",
-       "      <td>[4th, floor]</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>nv</td>\n",
-       "      <td>ma</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.019288</td>\n",
-       "      <td>0.041401</td>\n",
-       "      <td>0.310698</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>las vegas</td>\n",
-       "      <td>burlington</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.010477</td>\n",
-       "      <td>0.001415</td>\n",
-       "      <td>0.398403</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>INTRNXNL KM TXNLJ</td>\n",
-       "      <td>INTRJN NR0 AMRK</td>\n",
+       "      <td>24650</td>\n",
+       "      <td>8047</td>\n",
+       "      <td>green mountain power</td>\n",
+       "      <td>green mountain power</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000037</td>\n",
+       "      <td>0.000037</td>\n",
+       "      <td>652179.111493</td>\n",
+       "      <td>0.021160</td>\n",
+       "      <td>163 acorn ln</td>\n",
+       "      <td>163 acorn ln</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000037</td>\n",
+       "      <td>0.000037</td>\n",
+       "      <td>9450.378101</td>\n",
+       "      <td>0.317122</td>\n",
+       "      <td>vt</td>\n",
+       "      <td>vt</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.001537</td>\n",
+       "      <td>0.001537</td>\n",
+       "      <td>15.873789</td>\n",
+       "      <td>33.262692</td>\n",
+       "      <td>colchester</td>\n",
+       "      <td>colchester</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000183</td>\n",
+       "      <td>0.000183</td>\n",
+       "      <td>108.031428</td>\n",
+       "      <td>34.517394</td>\n",
+       "      <td>KRN MNTN PWR</td>\n",
+       "      <td>KRN MNTN PWR</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1038443</th>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
+       "      <th>438193</th>\n",
+       "      <td>27.757357</td>\n",
+       "      <td>1.000000e+00</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>90853</td>\n",
-       "      <td>13424</td>\n",
-       "      <td>monster arts</td>\n",
-       "      <td>minnesota solar csg 4</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.000010</td>\n",
-       "      <td>0.000029</td>\n",
-       "      <td>0.985981</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>806 east avenida pico</td>\n",
-       "      <td>200 wellington street west, su</td>\n",
-       "      <td>[806, east, avenida, pico]</td>\n",
-       "      <td>[200, wellington, street, west,, su]</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>ca</td>\n",
-       "      <td>None</td>\n",
-       "      <td>-1</td>\n",
-       "      <td>0.149142</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>san clemente</td>\n",
-       "      <td>toronto</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.000346</td>\n",
-       "      <td>0.002129</td>\n",
-       "      <td>0.398403</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>MNSTR ARTS</td>\n",
-       "      <td>MNST SLR KSK</td>\n",
+       "      <td>58842</td>\n",
+       "      <td>19906</td>\n",
+       "      <td>wausau paper mills</td>\n",
+       "      <td>wausau paper mills</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>652179.111493</td>\n",
+       "      <td>0.031739</td>\n",
+       "      <td>one clarks is</td>\n",
+       "      <td>one clarks is</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>9450.378101</td>\n",
+       "      <td>0.475683</td>\n",
+       "      <td>wi</td>\n",
+       "      <td>wi</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.008840</td>\n",
+       "      <td>0.008840</td>\n",
+       "      <td>15.873789</td>\n",
+       "      <td>5.782805</td>\n",
+       "      <td>wausau</td>\n",
+       "      <td>wausau</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000061</td>\n",
+       "      <td>0.000061</td>\n",
+       "      <td>108.031428</td>\n",
+       "      <td>103.552183</td>\n",
+       "      <td>WS PPR MLS</td>\n",
+       "      <td>WS PPR MLS</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1038454</th>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
+       "      <th>385934</th>\n",
+       "      <td>27.884385</td>\n",
+       "      <td>1.000000e+00</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>108136</td>\n",
-       "      <td>1959</td>\n",
-       "      <td>nxt id</td>\n",
-       "      <td>nextgrid mastic</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.000038</td>\n",
-       "      <td>0.000029</td>\n",
-       "      <td>0.985981</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>4 research drive, #402</td>\n",
-       "      <td>879 sanchez street</td>\n",
-       "      <td>[4, research, drive,, #402]</td>\n",
-       "      <td>[879, sanchez, street]</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>ct</td>\n",
-       "      <td>ca</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.020325</td>\n",
-       "      <td>0.149142</td>\n",
-       "      <td>0.310698</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>shelton</td>\n",
-       "      <td>san francisco</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.000390</td>\n",
-       "      <td>0.013374</td>\n",
-       "      <td>0.398403</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>NKST IT</td>\n",
-       "      <td>NKSTKRT MSTK</td>\n",
+       "      <td>51567</td>\n",
+       "      <td>17450</td>\n",
+       "      <td>st joseph light and power</td>\n",
+       "      <td>st joseph light and power</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>652179.111493</td>\n",
+       "      <td>0.031739</td>\n",
+       "      <td>520 francis st</td>\n",
+       "      <td>520 francis st</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>9450.378101</td>\n",
+       "      <td>0.475683</td>\n",
+       "      <td>mo</td>\n",
+       "      <td>mo</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.010118</td>\n",
+       "      <td>0.010118</td>\n",
+       "      <td>15.873789</td>\n",
+       "      <td>5.052049</td>\n",
+       "      <td>st joseph</td>\n",
+       "      <td>st joseph</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000049</td>\n",
+       "      <td>0.000049</td>\n",
+       "      <td>108.031428</td>\n",
+       "      <td>129.440229</td>\n",
+       "      <td>ST JSF LT ANT PWR</td>\n",
+       "      <td>ST JSF LT ANT PWR</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1038456</th>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
+       "      <th>503816</th>\n",
+       "      <td>29.211031</td>\n",
+       "      <td>1.000000e+00</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>91657</td>\n",
-       "      <td>105602</td>\n",
-       "      <td>coronado biosciences</td>\n",
-       "      <td>garnet energy</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.000019</td>\n",
-       "      <td>0.000038</td>\n",
-       "      <td>0.985981</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>24 new england executive park</td>\n",
-       "      <td>suite 102</td>\n",
-       "      <td>[24, new, england, executive, park]</td>\n",
-       "      <td>[suite, 102]</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>ma</td>\n",
-       "      <td>ca</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.041401</td>\n",
-       "      <td>0.149142</td>\n",
-       "      <td>0.310698</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>burlington</td>\n",
-       "      <td>westlake village</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.001415</td>\n",
-       "      <td>0.000691</td>\n",
-       "      <td>0.398403</td>\n",
-       "      <td>1.0</td>\n",
-       "      <td>KRNT BSSNSS</td>\n",
-       "      <td>KRNT ENRJ</td>\n",
+       "      <td>20588</td>\n",
+       "      <td>6741</td>\n",
+       "      <td>fibermark</td>\n",
+       "      <td>fibermark</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000037</td>\n",
+       "      <td>0.000037</td>\n",
+       "      <td>652179.111493</td>\n",
+       "      <td>0.021160</td>\n",
+       "      <td>161 wellington rd</td>\n",
+       "      <td>161 wellington rd</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>9450.378101</td>\n",
+       "      <td>0.475683</td>\n",
+       "      <td>vt</td>\n",
+       "      <td>vt</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.001537</td>\n",
+       "      <td>0.001537</td>\n",
+       "      <td>15.873789</td>\n",
+       "      <td>33.262692</td>\n",
+       "      <td>brattleboro</td>\n",
+       "      <td>brattleboro</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000086</td>\n",
+       "      <td>0.000086</td>\n",
+       "      <td>108.031428</td>\n",
+       "      <td>73.965845</td>\n",
+       "      <td>FBRMRK</td>\n",
+       "      <td>FBRMRK</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
-       "<p>1038457 rows × 36 columns</p>\n",
+       "<p>590575 rows × 37 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
-       "         match_weight  match_probability         source_dataset_l         source_dataset_r  record_id_l  record_id_r            company_name_no_legal_l              company_name_no_legal_r  gamma_company_name_no_legal  tf_company_name_no_legal_l  tf_company_name_no_legal_r  bf_company_name_no_legal  bf_tf_adj_company_name_no_legal                 street_address_l                street_address_r                  street_address_list_l                 street_address_list_r  gamma_street_address  bf_street_address state_l state_r  gamma_state  tf_state_l  tf_state_r  bf_state  bf_tf_adj_state               city_l            city_r  gamma_city  tf_city_l  tf_city_r   bf_city  bf_tf_adj_city company_name_mphone_l     company_name_mphone_r match_key\n",
-       "32260      -24.047823       5.766122e-08  __splink__input_table_0  __splink__input_table_1        82087       113663                    sutro biopharma    stirling energy systems solar one                            0                    0.000019                    0.000029                  0.985981                              1.0         310 utah ave., suite 150                       suite 150         [310, utah, ave.,, suite, 150]                          [suite, 150]                   0.0           0.265921      ca      az            0    0.149142    0.012950  0.310698              1.0  south san francisco           phoenix           0   0.001438   0.003511  0.398403             1.0              STR BFRM  STRLNK ENRJ SSTMS SLR ON         3\n",
-       "27875      -24.047823       5.766122e-08  __splink__input_table_0  __splink__input_table_1       126035       113797        corner growth acquisition 2  grubb and ellis management services                            0                    0.000010                    0.000019                  0.985981                              1.0     251 lytton avenue, suite 200                       suite 200     [251, lytton, avenue,, suite, 200]                          [suite, 200]                   0.0           0.265921      ca      pa            0    0.149142    0.030197  0.310698              1.0            palo alto        pittsburgh           0   0.001850   0.003656  0.398403             1.0      KRNR KR0 AKKSXN   KRB ANT ELS MNJMNT SRFSS         3\n",
-       "27993      -24.047823       5.766122e-08  __splink__input_table_0  __splink__input_table_1       125096        97905                        altus power            allegheny ridge wind farm                            0                    0.000010                    0.000038                  0.985981                              1.0  2200 atlantic street, 6th floor                       6th floor  [2200, atlantic, street,, 6th, floor]                          [6th, floor]                   0.0           0.265921      ct      ca            0    0.020325    0.149142  0.310698              1.0             stamford     san francisco           0   0.003789   0.013374  0.398403             1.0              ALTS PWR          ALKHN RJ WNT FRM         3\n",
-       "28003      -24.047823       5.766122e-08  __splink__input_table_0  __splink__input_table_1       115402        91508                    clearway energy                    clipper windpower                            0                    0.000038                    0.000029                  0.985981                              1.0   300 carnegie center, suite 300                       suite 300   [300, carnegie, center,, suite, 300]                          [suite, 300]                   0.0           0.265921      nj      ca            0    0.031159    0.149142  0.310698              1.0            princeton       carpinteria           0   0.002118   0.000189  0.398403             1.0             KLRW ENRJ               KLPR WNTPWR         3\n",
-       "28024      -24.047823       5.766122e-08  __splink__input_table_0  __splink__input_table_1       125009        77758  benchmark 2020 b21 mortgage trust               bountiful city city of                            0                    0.000010                    0.000048                  0.985981                              1.0                  200 west street       198 south 200 west street                    [200, west, street]       [198, south, 200, west, street]                   0.0           0.265921      ny      ut            0    0.113010    0.010475  0.310698              1.0             new york    bountiful city           0   0.086944   0.000022  0.398403             1.0   BNXMRK B MRTKJ TRST            BNTFL ST ST OF         3\n",
-       "...               ...                ...                      ...                      ...          ...          ...                                ...                                  ...                          ...                         ...                         ...                       ...                              ...                              ...                             ...                                    ...                                   ...                   ...                ...     ...     ...          ...         ...         ...       ...              ...                  ...               ...         ...        ...        ...       ...             ...                   ...                       ...       ...\n",
-       "1038434           NaN                NaN  __splink__input_table_0  __splink__input_table_1       137784        70294                    farmer brothers                  farmers electric ia                            0                    0.000029                    0.000038                  0.985981                              1.0            20333 s normandie ave               1959 yoder ave,sw             [20333, s, normandie, ave]                 [1959, yoder, ave,sw]                   NaN                NaN      ca      ia            0    0.149142    0.016527  0.310698              1.0             torrance            kalona           0   0.002485   0.000011  0.398403             1.0            FRMR BR0RS            FRMRS ELKTRK I         0\n",
-       "1038441           NaN                NaN  __splink__input_table_0  __splink__input_table_1       139631       137540      international game technology               intergen north america                            0                    0.000048                    0.000029                  0.985981                              1.0         6355 south buffalo drive                       4th floor          [6355, south, buffalo, drive]                          [4th, floor]                   NaN                NaN      nv      ma            0    0.019288    0.041401  0.310698              1.0            las vegas        burlington           0   0.010477   0.001415  0.398403             1.0     INTRNXNL KM TXNLJ           INTRJN NR0 AMRK         0\n",
-       "1038443           NaN                NaN  __splink__input_table_0  __splink__input_table_1        90853        13424                       monster arts                minnesota solar csg 4                            0                    0.000010                    0.000029                  0.985981                              1.0            806 east avenida pico  200 wellington street west, su             [806, east, avenida, pico]  [200, wellington, street, west,, su]                   NaN                NaN      ca    None           -1    0.149142         NaN  1.000000              1.0         san clemente           toronto           0   0.000346   0.002129  0.398403             1.0            MNSTR ARTS             MNST SLR KSK          0\n",
-       "1038454           NaN                NaN  __splink__input_table_0  __splink__input_table_1       108136         1959                             nxt id                      nextgrid mastic                            0                    0.000038                    0.000029                  0.985981                              1.0           4 research drive, #402              879 sanchez street            [4, research, drive,, #402]                [879, sanchez, street]                   NaN                NaN      ct      ca            0    0.020325    0.149142  0.310698              1.0              shelton     san francisco           0   0.000390   0.013374  0.398403             1.0               NKST IT              NKSTKRT MSTK         0\n",
-       "1038456           NaN                NaN  __splink__input_table_0  __splink__input_table_1        91657       105602               coronado biosciences                        garnet energy                            0                    0.000019                    0.000038                  0.985981                              1.0    24 new england executive park                       suite 102    [24, new, england, executive, park]                          [suite, 102]                   NaN                NaN      ma      ca            0    0.041401    0.149142  0.310698              1.0           burlington  westlake village           0   0.001415   0.000691  0.398403             1.0           KRNT BSSNSS                 KRNT ENRJ         0\n",
-       "\n",
-       "[1038457 rows x 36 columns]"
+       "        match_weight  match_probability         source_dataset_l         source_dataset_r  record_id_l  record_id_r      company_name_no_legal_l      company_name_no_legal_r  gamma_company_name_no_legal  tf_company_name_no_legal_l  tf_company_name_no_legal_r  bf_company_name_no_legal  bf_tf_adj_company_name_no_legal     street_address_l          street_address_r  gamma_street_address  tf_street_address_l  tf_street_address_r  bf_street_address  bf_tf_adj_street_address state_l state_r  gamma_state  tf_state_l  tf_state_r   bf_state  bf_tf_adj_state       city_l       city_r  gamma_city  tf_city_l  tf_city_r     bf_city  bf_tf_adj_city company_name_mphone_l  company_name_mphone_r match_key\n",
+       "295287    -22.970759       1.216501e-07  __splink__input_table_0  __splink__input_table_1         9829         3043              capitol bancorp                capital power                            0                    0.000024                    0.000012                  0.986045                         1.000000  capitol bancorp ctr       120010423 101 st nw                     0             0.000012             0.000110           0.881657                  1.000000      mi      ab            0    0.015147    0.000197   0.198711         1.000000      lansing     edmonton           0   0.000293   0.000428    0.296590        1.000000            KPTL BNKRP               KPTL PWR         0\n",
+       "383898    -22.970759       1.216501e-07  __splink__input_table_0  __splink__input_table_1        51783        17550                state bancorp  state street bank and trust                            0                    0.000024                    0.000024                  0.986045                         1.000000        2 jericho plz             100 summer st                     0             0.000012             0.000024           0.881657                  1.000000      ny      ma            0    0.120228    0.041765   0.198711         1.000000      jericho       boston           0   0.000306   0.014319    0.296590        1.000000             STT BNKRP  STT STRT BNK ANT TRST         0\n",
+       "383897    -22.970759       1.216501e-07  __splink__input_table_0  __splink__input_table_1        51782        17550         state auto financial  state street bank and trust                            0                    0.000024                    0.000024                  0.986045                         1.000000    518 east broad st             100 summer st                     0             0.000012             0.000024           0.881657                  1.000000      oh      ma            0    0.016991    0.041765   0.198711         1.000000     columbus       boston           0   0.002788   0.014319    0.296590        1.000000          STT AT FNNXL  STT STRT BNK ANT TRST         0\n",
+       "383896    -22.970759       1.216501e-07  __splink__input_table_0  __splink__input_table_1        51781        17550         state auto financial  state street bank and trust                            0                    0.000024                    0.000024                  0.986045                         1.000000       518 e broad st             100 summer st                     0             0.000012             0.000024           0.881657                  1.000000      oh      ma            0    0.016991    0.041765   0.198711         1.000000     columbus       boston           0   0.002788   0.014319    0.296590        1.000000          STT AT FNNXL  STT STRT BNK ANT TRST         0\n",
+       "383895    -22.970759       1.216501e-07  __splink__input_table_0  __splink__input_table_1        51780         3805                        starz                 citrus world                            0                    0.000024                    0.000049                  0.986045                         1.000000     8900 liberty cir  20205 hwy 2720205 hwy 27                     0             0.000024             0.000012           0.881657                  1.000000      co      fl            0    0.023802    0.048477   0.198711         1.000000    englewood   lake wales           0   0.002947   0.000049    0.296590        1.000000                  STRS              STRS WRLT         0\n",
+       "...              ...                ...                      ...                      ...          ...          ...                          ...                          ...                          ...                         ...                         ...                       ...                              ...                  ...                       ...                   ...                  ...                  ...                ...                       ...     ...     ...          ...         ...         ...        ...              ...          ...          ...         ...        ...        ...         ...             ...                   ...                    ...       ...\n",
+       "186872     27.519625       1.000000e+00  __splink__input_table_0  __splink__input_table_1        39816        13109  northwestern public service  northwestern public service                            2                    0.000073                    0.000073             652179.111493                         0.010580       33 third st se            33 third st se                     2             0.000037             0.000037        9450.378101                  0.317122      sd      sd            1    0.001930    0.001930  15.873789        26.483035        huron        huron           2   0.000073   0.000073  108.031428       86.293486    NR0WSTRN PBLK SRFS     NR0WSTRN PBLK SRFS         0\n",
+       "580681     27.526533       1.000000e+00  __splink__input_table_0  __splink__input_table_1        24650         8047         green mountain power         green mountain power                            2                    0.000037                    0.000037             652179.111493                         0.021160         163 acorn ln              163 acorn ln                     2             0.000037             0.000037        9450.378101                  0.317122      vt      vt            1    0.001537    0.001537  15.873789        33.262692   colchester   colchester           2   0.000183   0.000183  108.031428       34.517394          KRN MNTN PWR           KRN MNTN PWR         0\n",
+       "438193     27.757357       1.000000e+00  __splink__input_table_0  __splink__input_table_1        58842        19906           wausau paper mills           wausau paper mills                            2                    0.000024                    0.000024             652179.111493                         0.031739        one clarks is             one clarks is                     2             0.000024             0.000024        9450.378101                  0.475683      wi      wi            1    0.008840    0.008840  15.873789         5.782805       wausau       wausau           2   0.000061   0.000061  108.031428      103.552183            WS PPR MLS             WS PPR MLS         0\n",
+       "385934     27.884385       1.000000e+00  __splink__input_table_0  __splink__input_table_1        51567        17450    st joseph light and power    st joseph light and power                            2                    0.000024                    0.000024             652179.111493                         0.031739       520 francis st            520 francis st                     2             0.000024             0.000024        9450.378101                  0.475683      mo      mo            1    0.010118    0.010118  15.873789         5.052049    st joseph    st joseph           2   0.000049   0.000049  108.031428      129.440229     ST JSF LT ANT PWR      ST JSF LT ANT PWR         0\n",
+       "503816     29.211031       1.000000e+00  __splink__input_table_0  __splink__input_table_1        20588         6741                    fibermark                    fibermark                            2                    0.000037                    0.000037             652179.111493                         0.021160    161 wellington rd         161 wellington rd                     2             0.000024             0.000024        9450.378101                  0.475683      vt      vt            1    0.001537    0.001537  15.873789        33.262692  brattleboro  brattleboro           2   0.000086   0.000086  108.031428       73.965845                FBRMRK                 FBRMRK         0\n",
+       "\n",
+       "[590575 rows x 37 columns]"
       ]
      },
-     "execution_count": 433,
+     "execution_count": 135,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2769,13 +2370,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 434,
+   "execution_count": 136,
    "id": "c0b292c8-26ed-407a-866e-75851577d567",
    "metadata": {},
    "outputs": [],
    "source": [
     "# join on utility_id_eia and CIK\n",
-    "preds_validation_df = preds_df.merge(sec_clean_df[[\"record_id\", \"central_index_key\", \"company_name_raw\"]],\n",
+    "preds_validation_df = preds_df.merge(sec_df[[\"record_id\", \"sec_company_id\", \"central_index_key\", \"company_name_raw\"]],\n",
     "                                     how=\"left\",\n",
     "                                     left_on=\"record_id_l\",\n",
     "                                     right_on=\"record_id\")"
@@ -2783,12 +2384,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 435,
+   "execution_count": 137,
    "id": "e8744d54-3cc6-434c-bbbe-a10d2d3cd9c0",
    "metadata": {},
    "outputs": [],
    "source": [
-    "preds_validation_df = preds_validation_df.merge(eia_clean_df[[\"record_id\", \"utility_id_eia\"]],\n",
+    "preds_validation_df = preds_validation_df.merge(eia_df[[\"record_id\", \"utility_id_eia\"]],\n",
     "                                                how=\"left\",\n",
     "                                                left_on=\"record_id_r\",\n",
     "                                                right_on=\"record_id\")"
@@ -2796,19 +2397,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 436,
+   "execution_count": 138,
    "id": "5103190c-3775-427f-a8f2-cc8a8f79892b",
    "metadata": {},
    "outputs": [],
    "source": [
     "preds_validation_df = preds_validation_df.sort_values(\n",
-    "    by=[\"central_index_key\", \"utility_id_eia\", \"match_probability\"], ascending=False\n",
-    ").drop_duplicates(subset=[\"central_index_key\", \"utility_id_eia\"], keep=\"first\")"
+    "    by=[\"sec_company_id\", \"utility_id_eia\", \"match_probability\"], ascending=False\n",
+    ").drop_duplicates(subset=[\"sec_company_id\", \"utility_id_eia\"], keep=\"first\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 437,
+   "execution_count": 139,
    "id": "8fa04f18-beff-4bdc-bbbb-705950eab5b8",
    "metadata": {},
    "outputs": [
@@ -2848,10 +2449,11 @@
        "      <th>bf_tf_adj_company_name_no_legal</th>\n",
        "      <th>street_address_l</th>\n",
        "      <th>street_address_r</th>\n",
-       "      <th>street_address_list_l</th>\n",
-       "      <th>street_address_list_r</th>\n",
        "      <th>gamma_street_address</th>\n",
+       "      <th>tf_street_address_l</th>\n",
+       "      <th>tf_street_address_r</th>\n",
        "      <th>bf_street_address</th>\n",
+       "      <th>bf_tf_adj_street_address</th>\n",
        "      <th>state_l</th>\n",
        "      <th>state_r</th>\n",
        "      <th>gamma_state</th>\n",
@@ -2870,6 +2472,7 @@
        "      <th>company_name_mphone_r</th>\n",
        "      <th>match_key</th>\n",
        "      <th>record_id_x</th>\n",
+       "      <th>sec_company_id</th>\n",
        "      <th>central_index_key</th>\n",
        "      <th>company_name_raw</th>\n",
        "      <th>record_id_y</th>\n",
@@ -2878,224 +2481,234 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>889845</th>\n",
-       "      <td>5.679807</td>\n",
-       "      <td>0.980865</td>\n",
+       "      <th>466134</th>\n",
+       "      <td>3.824596</td>\n",
+       "      <td>0.934073</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>51956</td>\n",
-       "      <td>22658</td>\n",
-       "      <td>constellation energy</td>\n",
-       "      <td>constellation newenergy</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.000029</td>\n",
-       "      <td>0.000077</td>\n",
-       "      <td>6085.754919</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>1310 point street</td>\n",
-       "      <td>None</td>\n",
-       "      <td>[1310, point, street]</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>-1.0</td>\n",
+       "      <td>14692</td>\n",
+       "      <td>6293</td>\n",
+       "      <td>crane</td>\n",
+       "      <td>entergy nuclear power marketing</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000012</td>\n",
+       "      <td>0.000012</td>\n",
+       "      <td>0.986045</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>md</td>\n",
-       "      <td>md</td>\n",
+       "      <td>100 first stamford pl</td>\n",
+       "      <td>100 first stamford pl</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000122</td>\n",
+       "      <td>0.000122</td>\n",
+       "      <td>9450.378101</td>\n",
+       "      <td>0.095137</td>\n",
+       "      <td>ct</td>\n",
+       "      <td>ct</td>\n",
        "      <td>1</td>\n",
-       "      <td>0.023298</td>\n",
-       "      <td>0.023298</td>\n",
-       "      <td>14.856341</td>\n",
-       "      <td>2.034020</td>\n",
-       "      <td>baltimore</td>\n",
-       "      <td>baltimore</td>\n",
+       "      <td>0.020876</td>\n",
+       "      <td>0.020876</td>\n",
+       "      <td>15.873789</td>\n",
+       "      <td>2.448667</td>\n",
+       "      <td>stamford</td>\n",
+       "      <td>stamford</td>\n",
        "      <td>2</td>\n",
-       "      <td>0.003678</td>\n",
-       "      <td>0.003678</td>\n",
-       "      <td>94.80739</td>\n",
-       "      <td>1.654881</td>\n",
-       "      <td>KNSTLXN ENRJ</td>\n",
-       "      <td>KNSTLXN NWNRJ</td>\n",
-       "      <td>0</td>\n",
-       "      <td>51956</td>\n",
-       "      <td>0001868275</td>\n",
-       "      <td>constellation energy corp</td>\n",
-       "      <td>22658</td>\n",
-       "      <td>58491</td>\n",
+       "      <td>0.003950</td>\n",
+       "      <td>0.003950</td>\n",
+       "      <td>108.031428</td>\n",
+       "      <td>1.602975</td>\n",
+       "      <td>KRN</td>\n",
+       "      <td>ENTRJ NKLR PWR MRKTNK</td>\n",
+       "      <td>1</td>\n",
+       "      <td>14692</td>\n",
+       "      <td>0001944013</td>\n",
+       "      <td>0001944013</td>\n",
+       "      <td>crane co</td>\n",
+       "      <td>6293</td>\n",
+       "      <td>55243</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>884109</th>\n",
-       "      <td>13.095633</td>\n",
-       "      <td>0.999886</td>\n",
+       "      <th>466594</th>\n",
+       "      <td>4.620005</td>\n",
+       "      <td>0.960922</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>120267</td>\n",
-       "      <td>96849</td>\n",
-       "      <td>evergy</td>\n",
-       "      <td>evergy</td>\n",
+       "      <td>17752</td>\n",
+       "      <td>5535</td>\n",
+       "      <td>dte electric securitization funding i</td>\n",
+       "      <td>dte sustainable generation</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000012</td>\n",
+       "      <td>0.000012</td>\n",
+       "      <td>0.986045</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>one energy plz</td>\n",
+       "      <td>one energy plz</td>\n",
        "      <td>2</td>\n",
-       "      <td>0.000019</td>\n",
-       "      <td>0.000019</td>\n",
-       "      <td>872345.689655</td>\n",
-       "      <td>0.059564</td>\n",
-       "      <td>1200 main street</td>\n",
-       "      <td>1200 main street</td>\n",
-       "      <td>[1200, main, street]</td>\n",
-       "      <td>[1200, main, street]</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>5.407499</td>\n",
-       "      <td>mo</td>\n",
-       "      <td>mo</td>\n",
+       "      <td>0.000330</td>\n",
+       "      <td>0.000330</td>\n",
+       "      <td>9450.378101</td>\n",
+       "      <td>0.035236</td>\n",
+       "      <td>mi</td>\n",
+       "      <td>mi</td>\n",
        "      <td>1</td>\n",
-       "      <td>0.011744</td>\n",
-       "      <td>0.011744</td>\n",
-       "      <td>14.856341</td>\n",
-       "      <td>4.035057</td>\n",
-       "      <td>kansas city</td>\n",
-       "      <td>kansas city</td>\n",
+       "      <td>0.015147</td>\n",
+       "      <td>0.015147</td>\n",
+       "      <td>15.873789</td>\n",
+       "      <td>3.374867</td>\n",
+       "      <td>detroit</td>\n",
+       "      <td>detroit</td>\n",
        "      <td>2</td>\n",
-       "      <td>0.001973</td>\n",
-       "      <td>0.001973</td>\n",
-       "      <td>94.80739</td>\n",
-       "      <td>3.085372</td>\n",
-       "      <td>EFRJ</td>\n",
-       "      <td>EFRJ</td>\n",
-       "      <td>0</td>\n",
-       "      <td>120267</td>\n",
-       "      <td>0001711269</td>\n",
-       "      <td>evergy, inc.</td>\n",
-       "      <td>96849</td>\n",
-       "      <td>64428</td>\n",
+       "      <td>0.001162</td>\n",
+       "      <td>0.001162</td>\n",
+       "      <td>108.031428</td>\n",
+       "      <td>5.450115</td>\n",
+       "      <td>TT ELKTRK SKRTSXN FNTNK I</td>\n",
+       "      <td>TT SSTNBL JNRXN</td>\n",
+       "      <td>1</td>\n",
+       "      <td>17752</td>\n",
+       "      <td>0001876068</td>\n",
+       "      <td>0001876068</td>\n",
+       "      <td>dte electric securitization funding i llc</td>\n",
+       "      <td>5535</td>\n",
+       "      <td>64331</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>893941</th>\n",
-       "      <td>12.486567</td>\n",
-       "      <td>0.999826</td>\n",
+       "      <th>480747</th>\n",
+       "      <td>4.620005</td>\n",
+       "      <td>0.960922</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>120222</td>\n",
-       "      <td>96211</td>\n",
-       "      <td>consol energy</td>\n",
-       "      <td>consol energy</td>\n",
+       "      <td>17752</td>\n",
+       "      <td>5522</td>\n",
+       "      <td>dte electric securitization funding i</td>\n",
+       "      <td>dte electric</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000012</td>\n",
+       "      <td>0.000037</td>\n",
+       "      <td>0.986045</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>one energy plz</td>\n",
+       "      <td>one energy plz</td>\n",
        "      <td>2</td>\n",
-       "      <td>0.000058</td>\n",
-       "      <td>0.000058</td>\n",
-       "      <td>872345.689655</td>\n",
-       "      <td>0.019855</td>\n",
-       "      <td>275 technology drive</td>\n",
-       "      <td>275 technology drive</td>\n",
-       "      <td>[275, technology, drive]</td>\n",
-       "      <td>[275, technology, drive]</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>5.407499</td>\n",
-       "      <td>pa</td>\n",
-       "      <td>pa</td>\n",
+       "      <td>0.000330</td>\n",
+       "      <td>0.000330</td>\n",
+       "      <td>9450.378101</td>\n",
+       "      <td>0.035236</td>\n",
+       "      <td>mi</td>\n",
+       "      <td>mi</td>\n",
        "      <td>1</td>\n",
-       "      <td>0.030197</td>\n",
-       "      <td>0.030197</td>\n",
-       "      <td>14.856341</td>\n",
-       "      <td>1.569346</td>\n",
-       "      <td>canonsburg</td>\n",
-       "      <td>canonsburg</td>\n",
+       "      <td>0.015147</td>\n",
+       "      <td>0.015147</td>\n",
+       "      <td>15.873789</td>\n",
+       "      <td>3.374867</td>\n",
+       "      <td>detroit</td>\n",
+       "      <td>detroit</td>\n",
        "      <td>2</td>\n",
-       "      <td>0.000390</td>\n",
-       "      <td>0.000390</td>\n",
-       "      <td>94.80739</td>\n",
-       "      <td>15.603165</td>\n",
-       "      <td>KNSL ENRJ</td>\n",
-       "      <td>KNSL ENRJ</td>\n",
-       "      <td>0</td>\n",
-       "      <td>120222</td>\n",
-       "      <td>0001710366</td>\n",
-       "      <td>consol energy inc.</td>\n",
-       "      <td>96211</td>\n",
-       "      <td>4299</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>943594</th>\n",
-       "      <td>9.161274</td>\n",
-       "      <td>0.998256</td>\n",
+       "      <td>0.001162</td>\n",
+       "      <td>0.001162</td>\n",
+       "      <td>108.031428</td>\n",
+       "      <td>5.450115</td>\n",
+       "      <td>TT ELKTRK SKRTSXN FNTNK I</td>\n",
+       "      <td>TT ELKTRK</td>\n",
+       "      <td>0</td>\n",
+       "      <td>17752</td>\n",
+       "      <td>0001876068</td>\n",
+       "      <td>0001876068</td>\n",
+       "      <td>dte electric securitization funding i llc</td>\n",
+       "      <td>5522</td>\n",
+       "      <td>5109</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>464506</th>\n",
+       "      <td>6.019599</td>\n",
+       "      <td>0.984820</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>119271</td>\n",
-       "      <td>83669</td>\n",
-       "      <td>vistra energy</td>\n",
-       "      <td>vistra energy</td>\n",
+       "      <td>14051</td>\n",
+       "      <td>10935</td>\n",
+       "      <td>constellation energy</td>\n",
+       "      <td>luminace solar rhode island</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.986045</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1310 pt st</td>\n",
+       "      <td>1310 pt st</td>\n",
        "      <td>2</td>\n",
-       "      <td>0.000019</td>\n",
-       "      <td>0.000019</td>\n",
-       "      <td>872345.689655</td>\n",
-       "      <td>0.059564</td>\n",
-       "      <td>6555 sierra drive</td>\n",
-       "      <td>6555 sierra drive</td>\n",
-       "      <td>[6555, sierra, drive]</td>\n",
-       "      <td>[6555, sierra, drive]</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>5.407499</td>\n",
-       "      <td>tx</td>\n",
-       "      <td>tx</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>9450.378101</td>\n",
+       "      <td>0.475683</td>\n",
+       "      <td>md</td>\n",
+       "      <td>md</td>\n",
        "      <td>1</td>\n",
-       "      <td>0.080866</td>\n",
-       "      <td>0.080866</td>\n",
-       "      <td>14.856341</td>\n",
-       "      <td>0.586015</td>\n",
-       "      <td>irving</td>\n",
-       "      <td>irving</td>\n",
+       "      <td>0.025130</td>\n",
+       "      <td>0.025130</td>\n",
+       "      <td>15.873789</td>\n",
+       "      <td>2.034167</td>\n",
+       "      <td>baltimore</td>\n",
+       "      <td>baltimore</td>\n",
        "      <td>2</td>\n",
-       "      <td>0.004380</td>\n",
-       "      <td>0.004380</td>\n",
-       "      <td>94.80739</td>\n",
-       "      <td>1.389595</td>\n",
-       "      <td>FSTR ENRJ</td>\n",
-       "      <td>FSTR ENRJ</td>\n",
-       "      <td>0</td>\n",
-       "      <td>119271</td>\n",
-       "      <td>0001692819</td>\n",
-       "      <td>vistra energy corp.</td>\n",
-       "      <td>83669</td>\n",
-       "      <td>62723</td>\n",
+       "      <td>0.003583</td>\n",
+       "      <td>0.003583</td>\n",
+       "      <td>108.031428</td>\n",
+       "      <td>1.767102</td>\n",
+       "      <td>KNSTLXN ENRJ</td>\n",
+       "      <td>LMNS SLR RHT ISLNT</td>\n",
+       "      <td>1</td>\n",
+       "      <td>14051</td>\n",
+       "      <td>0001868275</td>\n",
+       "      <td>0001868275</td>\n",
+       "      <td>constellation energy corp</td>\n",
+       "      <td>10935</td>\n",
+       "      <td>62679</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>860414</th>\n",
-       "      <td>7.576311</td>\n",
-       "      <td>0.994788</td>\n",
+       "      <th>340973</th>\n",
+       "      <td>6.201744</td>\n",
+       "      <td>0.986596</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>119274</td>\n",
-       "      <td>71441</td>\n",
-       "      <td>vistra</td>\n",
-       "      <td>vistra</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000058</td>\n",
-       "      <td>0.000058</td>\n",
-       "      <td>872345.689655</td>\n",
-       "      <td>0.019855</td>\n",
-       "      <td>6555 sierra drive</td>\n",
-       "      <td>6555 sierra drive</td>\n",
-       "      <td>[6555, sierra, drive]</td>\n",
-       "      <td>[6555, sierra, drive]</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>5.407499</td>\n",
-       "      <td>tx</td>\n",
-       "      <td>tx</td>\n",
+       "      <td>14051</td>\n",
+       "      <td>4420</td>\n",
+       "      <td>constellation energy</td>\n",
+       "      <td>constellation newenergy</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>5704.210475</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1310 pt st</td>\n",
+       "      <td>100 constellation way</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000183</td>\n",
+       "      <td>0.881657</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>md</td>\n",
+       "      <td>md</td>\n",
        "      <td>1</td>\n",
-       "      <td>0.080866</td>\n",
-       "      <td>0.080866</td>\n",
-       "      <td>14.856341</td>\n",
-       "      <td>0.586015</td>\n",
-       "      <td>irving</td>\n",
-       "      <td>irving</td>\n",
+       "      <td>0.025130</td>\n",
+       "      <td>0.025130</td>\n",
+       "      <td>15.873789</td>\n",
+       "      <td>2.034167</td>\n",
+       "      <td>baltimore</td>\n",
+       "      <td>baltimore</td>\n",
        "      <td>2</td>\n",
-       "      <td>0.004380</td>\n",
-       "      <td>0.004380</td>\n",
-       "      <td>94.80739</td>\n",
-       "      <td>1.389595</td>\n",
-       "      <td>FSTR</td>\n",
-       "      <td>FSTR</td>\n",
+       "      <td>0.003583</td>\n",
+       "      <td>0.003583</td>\n",
+       "      <td>108.031428</td>\n",
+       "      <td>1.767102</td>\n",
+       "      <td>KNSTLXN ENRJ</td>\n",
+       "      <td>KNSTLXN NWNRJ</td>\n",
        "      <td>0</td>\n",
-       "      <td>119274</td>\n",
-       "      <td>0001692819</td>\n",
-       "      <td>vistra corp.</td>\n",
-       "      <td>71441</td>\n",
-       "      <td>5504</td>\n",
+       "      <td>14051</td>\n",
+       "      <td>0001868275</td>\n",
+       "      <td>0001868275</td>\n",
+       "      <td>constellation energy corp</td>\n",
+       "      <td>4420</td>\n",
+       "      <td>58491</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>...</th>\n",
@@ -3140,250 +2753,262 @@
        "      <td>...</td>\n",
        "      <td>...</td>\n",
        "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1026765</th>\n",
-       "      <td>12.087133</td>\n",
-       "      <td>0.999770</td>\n",
+       "      <th>464642</th>\n",
+       "      <td>5.308053</td>\n",
+       "      <td>0.975380</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>153106</td>\n",
-       "      <td>79761</td>\n",
-       "      <td>archer daniels midland</td>\n",
-       "      <td>archer daniels midland</td>\n",
+       "      <td>1585</td>\n",
+       "      <td>6561</td>\n",
+       "      <td>air products and chemicals /de/</td>\n",
+       "      <td>exelon gen extexlaporte</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000012</td>\n",
+       "      <td>0.986045</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>7201 hamilton blvd</td>\n",
+       "      <td>7201 hamilton blvd</td>\n",
        "      <td>2</td>\n",
-       "      <td>0.000058</td>\n",
-       "      <td>0.000058</td>\n",
-       "      <td>872345.689655</td>\n",
-       "      <td>0.019855</td>\n",
-       "      <td>4666 faries pkwy</td>\n",
-       "      <td>4666 faries pkwy</td>\n",
-       "      <td>[4666, faries, pkwy]</td>\n",
-       "      <td>[4666, faries, pkwy]</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>5.407499</td>\n",
-       "      <td>il</td>\n",
-       "      <td>il</td>\n",
+       "      <td>0.000122</td>\n",
+       "      <td>0.000122</td>\n",
+       "      <td>9450.378101</td>\n",
+       "      <td>0.095137</td>\n",
+       "      <td>pa</td>\n",
+       "      <td>pa</td>\n",
        "      <td>1</td>\n",
-       "      <td>0.033191</td>\n",
-       "      <td>0.033191</td>\n",
-       "      <td>14.856341</td>\n",
-       "      <td>1.427770</td>\n",
-       "      <td>decatur</td>\n",
-       "      <td>decatur</td>\n",
+       "      <td>0.029409</td>\n",
+       "      <td>0.029409</td>\n",
+       "      <td>15.873789</td>\n",
+       "      <td>1.738226</td>\n",
+       "      <td>allentown</td>\n",
+       "      <td>allentown</td>\n",
        "      <td>2</td>\n",
-       "      <td>0.000468</td>\n",
-       "      <td>0.000468</td>\n",
-       "      <td>94.80739</td>\n",
-       "      <td>13.002638</td>\n",
-       "      <td>ARXR TNLS MTLNT</td>\n",
-       "      <td>ARXR TNLS MTLNT</td>\n",
-       "      <td>0</td>\n",
-       "      <td>153106</td>\n",
-       "      <td>0000007084</td>\n",
-       "      <td>archer daniels midland co</td>\n",
-       "      <td>79761</td>\n",
-       "      <td>772</td>\n",
+       "      <td>0.001003</td>\n",
+       "      <td>0.001003</td>\n",
+       "      <td>108.031428</td>\n",
+       "      <td>6.314158</td>\n",
+       "      <td>AR PRTKTS ANT XMKLS T</td>\n",
+       "      <td>EKSLN JN EKSTKSLPRT</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1585</td>\n",
+       "      <td>0000002969</td>\n",
+       "      <td>0000002969</td>\n",
+       "      <td>air products &amp; chemicals inc /de/</td>\n",
+       "      <td>6561</td>\n",
+       "      <td>6081</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>656833</th>\n",
-       "      <td>9.809977</td>\n",
-       "      <td>0.998887</td>\n",
+       "      <th>227094</th>\n",
+       "      <td>20.402617</td>\n",
+       "      <td>0.999999</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>150546</td>\n",
-       "      <td>79913</td>\n",
-       "      <td>appalachian power</td>\n",
-       "      <td>appalachian power</td>\n",
+       "      <td>1586</td>\n",
+       "      <td>430</td>\n",
+       "      <td>air products and chemicals</td>\n",
+       "      <td>air products and chemicals</td>\n",
        "      <td>2</td>\n",
-       "      <td>0.000077</td>\n",
-       "      <td>0.000077</td>\n",
-       "      <td>872345.689655</td>\n",
-       "      <td>0.014891</td>\n",
-       "      <td>1 riverside plaza</td>\n",
-       "      <td>1 riverside plaza</td>\n",
-       "      <td>[1, riverside, plaza]</td>\n",
-       "      <td>[1, riverside, plaza]</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>5.407499</td>\n",
-       "      <td>oh</td>\n",
-       "      <td>oh</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.018770</td>\n",
-       "      <td>0.018770</td>\n",
-       "      <td>14.856341</td>\n",
-       "      <td>2.524754</td>\n",
-       "      <td>columbus</td>\n",
-       "      <td>columbus</td>\n",
+       "      <td>0.000037</td>\n",
+       "      <td>0.000037</td>\n",
+       "      <td>652179.111493</td>\n",
+       "      <td>0.021160</td>\n",
+       "      <td>1940 air products blvd</td>\n",
+       "      <td>1940 air products blvd</td>\n",
        "      <td>2</td>\n",
-       "      <td>0.003009</td>\n",
-       "      <td>0.003009</td>\n",
-       "      <td>94.80739</td>\n",
-       "      <td>2.022633</td>\n",
-       "      <td>APLXN PWR</td>\n",
-       "      <td>APLXN PWR</td>\n",
-       "      <td>0</td>\n",
-       "      <td>150546</td>\n",
-       "      <td>0000006879</td>\n",
-       "      <td>appalachian power co</td>\n",
-       "      <td>79913</td>\n",
-       "      <td>733</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>640747</th>\n",
-       "      <td>10.888046</td>\n",
-       "      <td>0.999473</td>\n",
-       "      <td>__splink__input_table_0</td>\n",
-       "      <td>__splink__input_table_1</td>\n",
-       "      <td>144743</td>\n",
-       "      <td>80319</td>\n",
-       "      <td>american crystal sugar /mn/</td>\n",
-       "      <td>american crystal sugar</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.000010</td>\n",
-       "      <td>0.000029</td>\n",
-       "      <td>6085.754919</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>101 n 3rd st</td>\n",
-       "      <td>None</td>\n",
-       "      <td>[101, n, 3rd, st]</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>-1.0</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>mn</td>\n",
-       "      <td>mn</td>\n",
+       "      <td>0.000049</td>\n",
+       "      <td>0.000049</td>\n",
+       "      <td>9450.378101</td>\n",
+       "      <td>0.237842</td>\n",
+       "      <td>pa</td>\n",
+       "      <td>pa</td>\n",
        "      <td>1</td>\n",
-       "      <td>0.025996</td>\n",
-       "      <td>0.025996</td>\n",
-       "      <td>14.856341</td>\n",
-       "      <td>1.822919</td>\n",
-       "      <td>moorhead</td>\n",
-       "      <td>moorhead</td>\n",
+       "      <td>0.029409</td>\n",
+       "      <td>0.029409</td>\n",
+       "      <td>15.873789</td>\n",
+       "      <td>1.738226</td>\n",
+       "      <td>allentown</td>\n",
+       "      <td>allentown</td>\n",
        "      <td>2</td>\n",
-       "      <td>0.000089</td>\n",
-       "      <td>0.000089</td>\n",
-       "      <td>94.80739</td>\n",
-       "      <td>68.263848</td>\n",
-       "      <td>AMRKN KRSTL SKR MN</td>\n",
-       "      <td>AMRKN KRSTL SKR</td>\n",
+       "      <td>0.001003</td>\n",
+       "      <td>0.001003</td>\n",
+       "      <td>108.031428</td>\n",
+       "      <td>6.314158</td>\n",
+       "      <td>AR PRTKTS ANT XMKLS</td>\n",
+       "      <td>AR PRTKTS ANT XMKLS</td>\n",
        "      <td>0</td>\n",
-       "      <td>144743</td>\n",
-       "      <td>0000004828</td>\n",
-       "      <td>american crystal sugar co /mn/</td>\n",
-       "      <td>80319</td>\n",
-       "      <td>491</td>\n",
+       "      <td>1586</td>\n",
+       "      <td>0000002969</td>\n",
+       "      <td>0000002969</td>\n",
+       "      <td>air products &amp; chemicals, inc.</td>\n",
+       "      <td>430</td>\n",
+       "      <td>991</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>998578</th>\n",
-       "      <td>9.990554</td>\n",
-       "      <td>0.999018</td>\n",
+       "      <th>224504</th>\n",
+       "      <td>5.308053</td>\n",
+       "      <td>0.975380</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>2575</td>\n",
-       "      <td>80977</td>\n",
-       "      <td>alabama power</td>\n",
-       "      <td>alabama power</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000067</td>\n",
-       "      <td>0.000067</td>\n",
-       "      <td>872345.689655</td>\n",
-       "      <td>0.017018</td>\n",
-       "      <td>600 n 18th st</td>\n",
-       "      <td>None</td>\n",
-       "      <td>[600, n, 18th, st]</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>-1.0</td>\n",
+       "      <td>1585</td>\n",
+       "      <td>435</td>\n",
+       "      <td>air products and chemicals /de/</td>\n",
+       "      <td>air products</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000037</td>\n",
+       "      <td>0.986045</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>al</td>\n",
-       "      <td>al</td>\n",
+       "      <td>7201 hamilton blvd</td>\n",
+       "      <td>7201 hamilton blvd</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000122</td>\n",
+       "      <td>0.000122</td>\n",
+       "      <td>9450.378101</td>\n",
+       "      <td>0.095137</td>\n",
+       "      <td>pa</td>\n",
+       "      <td>pa</td>\n",
        "      <td>1</td>\n",
-       "      <td>0.005280</td>\n",
-       "      <td>0.005280</td>\n",
-       "      <td>14.856341</td>\n",
-       "      <td>8.975778</td>\n",
-       "      <td>birmingham</td>\n",
-       "      <td>birmingham</td>\n",
+       "      <td>0.029409</td>\n",
+       "      <td>0.029409</td>\n",
+       "      <td>15.873789</td>\n",
+       "      <td>1.738226</td>\n",
+       "      <td>allentown</td>\n",
+       "      <td>allentown</td>\n",
        "      <td>2</td>\n",
-       "      <td>0.001995</td>\n",
-       "      <td>0.001995</td>\n",
-       "      <td>94.80739</td>\n",
-       "      <td>3.050898</td>\n",
-       "      <td>ALBM PWR</td>\n",
-       "      <td>ALBM PWR</td>\n",
+       "      <td>0.001003</td>\n",
+       "      <td>0.001003</td>\n",
+       "      <td>108.031428</td>\n",
+       "      <td>6.314158</td>\n",
+       "      <td>AR PRTKTS ANT XMKLS T</td>\n",
+       "      <td>AR PRTKTS</td>\n",
        "      <td>0</td>\n",
-       "      <td>2575</td>\n",
-       "      <td>0000003153</td>\n",
-       "      <td>alabama power co</td>\n",
-       "      <td>80977</td>\n",
-       "      <td>195</td>\n",
+       "      <td>1585</td>\n",
+       "      <td>0000002969</td>\n",
+       "      <td>0000002969</td>\n",
+       "      <td>air products &amp; chemicals inc /de/</td>\n",
+       "      <td>435</td>\n",
+       "      <td>980</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>912914</th>\n",
-       "      <td>9.434494</td>\n",
-       "      <td>0.998557</td>\n",
+       "      <th>225982</th>\n",
+       "      <td>5.308053</td>\n",
+       "      <td>0.975380</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>132976</td>\n",
-       "      <td>79317</td>\n",
+       "      <td>1585</td>\n",
+       "      <td>432</td>\n",
        "      <td>air products and chemicals /de/</td>\n",
-       "      <td>air products and chemicals</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.000019</td>\n",
-       "      <td>0.000048</td>\n",
-       "      <td>6085.754919</td>\n",
+       "      <td>air products energy enterprises</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000012</td>\n",
+       "      <td>0.986045</td>\n",
        "      <td>1.000000</td>\n",
        "      <td>7201 hamilton blvd</td>\n",
        "      <td>7201 hamilton blvd</td>\n",
-       "      <td>[7201, hamilton, blvd]</td>\n",
-       "      <td>[7201, hamilton, blvd]</td>\n",
-       "      <td>2.0</td>\n",
-       "      <td>5.407499</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000122</td>\n",
+       "      <td>0.000122</td>\n",
+       "      <td>9450.378101</td>\n",
+       "      <td>0.095137</td>\n",
        "      <td>pa</td>\n",
        "      <td>pa</td>\n",
        "      <td>1</td>\n",
-       "      <td>0.030197</td>\n",
-       "      <td>0.030197</td>\n",
-       "      <td>14.856341</td>\n",
-       "      <td>1.569346</td>\n",
+       "      <td>0.029409</td>\n",
+       "      <td>0.029409</td>\n",
+       "      <td>15.873789</td>\n",
+       "      <td>1.738226</td>\n",
        "      <td>allentown</td>\n",
        "      <td>allentown</td>\n",
        "      <td>2</td>\n",
-       "      <td>0.001137</td>\n",
-       "      <td>0.001137</td>\n",
-       "      <td>94.80739</td>\n",
-       "      <td>5.354027</td>\n",
+       "      <td>0.001003</td>\n",
+       "      <td>0.001003</td>\n",
+       "      <td>108.031428</td>\n",
+       "      <td>6.314158</td>\n",
        "      <td>AR PRTKTS ANT XMKLS T</td>\n",
-       "      <td>AR PRTKTS ANT XMKLS</td>\n",
+       "      <td>AR PRTKTS ENRJ ENTRPRSS</td>\n",
        "      <td>0</td>\n",
-       "      <td>132976</td>\n",
+       "      <td>1585</td>\n",
+       "      <td>0000002969</td>\n",
        "      <td>0000002969</td>\n",
        "      <td>air products &amp; chemicals inc /de/</td>\n",
-       "      <td>79317</td>\n",
-       "      <td>991</td>\n",
+       "      <td>432</td>\n",
+       "      <td>353</td>\n",
        "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>197 rows × 41 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "         match_weight  match_probability         source_dataset_l         source_dataset_r  record_id_l  record_id_r          company_name_no_legal_l     company_name_no_legal_r  gamma_company_name_no_legal  tf_company_name_no_legal_l  tf_company_name_no_legal_r  bf_company_name_no_legal  bf_tf_adj_company_name_no_legal      street_address_l      street_address_r     street_address_list_l     street_address_list_r  gamma_street_address  bf_street_address state_l state_r  gamma_state  tf_state_l  tf_state_r   bf_state  bf_tf_adj_state       city_l       city_r  gamma_city  tf_city_l  tf_city_r   bf_city  bf_tf_adj_city  company_name_mphone_l company_name_mphone_r match_key  record_id_x central_index_key                   company_name_raw  record_id_y  utility_id_eia\n",
-       "889845       5.679807           0.980865  __splink__input_table_0  __splink__input_table_1        51956        22658             constellation energy     constellation newenergy                            1                    0.000029                    0.000077               6085.754919                         1.000000     1310 point street                  None     [1310, point, street]                       NaN                  -1.0           1.000000      md      md            1    0.023298    0.023298  14.856341         2.034020    baltimore    baltimore           2   0.003678   0.003678  94.80739        1.654881           KNSTLXN ENRJ         KNSTLXN NWNRJ         0        51956        0001868275          constellation energy corp        22658           58491\n",
-       "884109      13.095633           0.999886  __splink__input_table_0  __splink__input_table_1       120267        96849                           evergy                      evergy                            2                    0.000019                    0.000019             872345.689655                         0.059564      1200 main street      1200 main street      [1200, main, street]      [1200, main, street]                   2.0           5.407499      mo      mo            1    0.011744    0.011744  14.856341         4.035057  kansas city  kansas city           2   0.001973   0.001973  94.80739        3.085372                   EFRJ                  EFRJ         0       120267        0001711269                       evergy, inc.        96849           64428\n",
-       "893941      12.486567           0.999826  __splink__input_table_0  __splink__input_table_1       120222        96211                    consol energy               consol energy                            2                    0.000058                    0.000058             872345.689655                         0.019855  275 technology drive  275 technology drive  [275, technology, drive]  [275, technology, drive]                   2.0           5.407499      pa      pa            1    0.030197    0.030197  14.856341         1.569346   canonsburg   canonsburg           2   0.000390   0.000390  94.80739       15.603165              KNSL ENRJ             KNSL ENRJ         0       120222        0001710366                 consol energy inc.        96211            4299\n",
-       "943594       9.161274           0.998256  __splink__input_table_0  __splink__input_table_1       119271        83669                    vistra energy               vistra energy                            2                    0.000019                    0.000019             872345.689655                         0.059564     6555 sierra drive     6555 sierra drive     [6555, sierra, drive]     [6555, sierra, drive]                   2.0           5.407499      tx      tx            1    0.080866    0.080866  14.856341         0.586015       irving       irving           2   0.004380   0.004380  94.80739        1.389595              FSTR ENRJ             FSTR ENRJ         0       119271        0001692819                vistra energy corp.        83669           62723\n",
-       "860414       7.576311           0.994788  __splink__input_table_0  __splink__input_table_1       119274        71441                           vistra                      vistra                            2                    0.000058                    0.000058             872345.689655                         0.019855     6555 sierra drive     6555 sierra drive     [6555, sierra, drive]     [6555, sierra, drive]                   2.0           5.407499      tx      tx            1    0.080866    0.080866  14.856341         0.586015       irving       irving           2   0.004380   0.004380  94.80739        1.389595                   FSTR                  FSTR         0       119274        0001692819                       vistra corp.        71441            5504\n",
-       "...               ...                ...                      ...                      ...          ...          ...                              ...                         ...                          ...                         ...                         ...                       ...                              ...                   ...                   ...                       ...                       ...                   ...                ...     ...     ...          ...         ...         ...        ...              ...          ...          ...         ...        ...        ...       ...             ...                    ...                   ...       ...          ...               ...                                ...          ...             ...\n",
-       "1026765     12.087133           0.999770  __splink__input_table_0  __splink__input_table_1       153106        79761           archer daniels midland      archer daniels midland                            2                    0.000058                    0.000058             872345.689655                         0.019855      4666 faries pkwy      4666 faries pkwy      [4666, faries, pkwy]      [4666, faries, pkwy]                   2.0           5.407499      il      il            1    0.033191    0.033191  14.856341         1.427770      decatur      decatur           2   0.000468   0.000468  94.80739       13.002638        ARXR TNLS MTLNT       ARXR TNLS MTLNT         0       153106        0000007084          archer daniels midland co        79761             772\n",
-       "656833       9.809977           0.998887  __splink__input_table_0  __splink__input_table_1       150546        79913                appalachian power           appalachian power                            2                    0.000077                    0.000077             872345.689655                         0.014891     1 riverside plaza     1 riverside plaza     [1, riverside, plaza]     [1, riverside, plaza]                   2.0           5.407499      oh      oh            1    0.018770    0.018770  14.856341         2.524754     columbus     columbus           2   0.003009   0.003009  94.80739        2.022633              APLXN PWR             APLXN PWR         0       150546        0000006879               appalachian power co        79913             733\n",
-       "640747      10.888046           0.999473  __splink__input_table_0  __splink__input_table_1       144743        80319      american crystal sugar /mn/      american crystal sugar                            1                    0.000010                    0.000029               6085.754919                         1.000000          101 n 3rd st                  None         [101, n, 3rd, st]                       NaN                  -1.0           1.000000      mn      mn            1    0.025996    0.025996  14.856341         1.822919     moorhead     moorhead           2   0.000089   0.000089  94.80739       68.263848     AMRKN KRSTL SKR MN       AMRKN KRSTL SKR         0       144743        0000004828     american crystal sugar co /mn/        80319             491\n",
-       "998578       9.990554           0.999018  __splink__input_table_0  __splink__input_table_1         2575        80977                    alabama power               alabama power                            2                    0.000067                    0.000067             872345.689655                         0.017018         600 n 18th st                  None        [600, n, 18th, st]                       NaN                  -1.0           1.000000      al      al            1    0.005280    0.005280  14.856341         8.975778   birmingham   birmingham           2   0.001995   0.001995  94.80739        3.050898               ALBM PWR              ALBM PWR         0         2575        0000003153                   alabama power co        80977             195\n",
-       "912914       9.434494           0.998557  __splink__input_table_0  __splink__input_table_1       132976        79317  air products and chemicals /de/  air products and chemicals                            1                    0.000019                    0.000048               6085.754919                         1.000000    7201 hamilton blvd    7201 hamilton blvd    [7201, hamilton, blvd]    [7201, hamilton, blvd]                   2.0           5.407499      pa      pa            1    0.030197    0.030197  14.856341         1.569346    allentown    allentown           2   0.001137   0.001137  94.80739        5.354027  AR PRTKTS ANT XMKLS T   AR PRTKTS ANT XMKLS         0       132976        0000002969  air products & chemicals inc /de/        79317             991\n",
-       "\n",
-       "[197 rows x 41 columns]"
-      ]
-     },
-     "execution_count": 437,
+       "    <tr>\n",
+       "      <th>224473</th>\n",
+       "      <td>20.054878</td>\n",
+       "      <td>0.999999</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>1348</td>\n",
+       "      <td>376</td>\n",
+       "      <td>aetna life and casualty</td>\n",
+       "      <td>aetna life and casualty</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>652179.111493</td>\n",
+       "      <td>0.031739</td>\n",
+       "      <td>151 farmington ave</td>\n",
+       "      <td>151 farmington ave</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000110</td>\n",
+       "      <td>0.000110</td>\n",
+       "      <td>9450.378101</td>\n",
+       "      <td>0.105707</td>\n",
+       "      <td>ct</td>\n",
+       "      <td>ct</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.020876</td>\n",
+       "      <td>0.020876</td>\n",
+       "      <td>15.873789</td>\n",
+       "      <td>2.448667</td>\n",
+       "      <td>hartford</td>\n",
+       "      <td>hartford</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.001198</td>\n",
+       "      <td>0.001198</td>\n",
+       "      <td>108.031428</td>\n",
+       "      <td>5.283275</td>\n",
+       "      <td>ETN LF ANT KSLT</td>\n",
+       "      <td>ETN LF ANT KSLT</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1348</td>\n",
+       "      <td>0000002648</td>\n",
+       "      <td>0000002648</td>\n",
+       "      <td>aetna life &amp; casualty co</td>\n",
+       "      <td>376</td>\n",
+       "      <td>211</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>2085 rows × 43 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "        match_weight  match_probability         source_dataset_l         source_dataset_r  record_id_l  record_id_r                company_name_no_legal_l          company_name_no_legal_r  gamma_company_name_no_legal  tf_company_name_no_legal_l  tf_company_name_no_legal_r  bf_company_name_no_legal  bf_tf_adj_company_name_no_legal        street_address_l        street_address_r  gamma_street_address  tf_street_address_l  tf_street_address_r  bf_street_address  bf_tf_adj_street_address state_l state_r  gamma_state  tf_state_l  tf_state_r   bf_state  bf_tf_adj_state     city_l     city_r  gamma_city  tf_city_l  tf_city_r     bf_city  bf_tf_adj_city      company_name_mphone_l    company_name_mphone_r match_key  record_id_x sec_company_id central_index_key                           company_name_raw  record_id_y  utility_id_eia\n",
+       "466134      3.824596           0.934073  __splink__input_table_0  __splink__input_table_1        14692         6293                                  crane  entergy nuclear power marketing                            0                    0.000012                    0.000012                  0.986045                         1.000000   100 first stamford pl   100 first stamford pl                     2             0.000122             0.000122        9450.378101                  0.095137      ct      ct            1    0.020876    0.020876  15.873789         2.448667   stamford   stamford           2   0.003950   0.003950  108.031428        1.602975                        KRN    ENTRJ NKLR PWR MRKTNK         1        14692     0001944013        0001944013                                   crane co         6293           55243\n",
+       "466594      4.620005           0.960922  __splink__input_table_0  __splink__input_table_1        17752         5535  dte electric securitization funding i       dte sustainable generation                            0                    0.000012                    0.000012                  0.986045                         1.000000          one energy plz          one energy plz                     2             0.000330             0.000330        9450.378101                  0.035236      mi      mi            1    0.015147    0.015147  15.873789         3.374867    detroit    detroit           2   0.001162   0.001162  108.031428        5.450115  TT ELKTRK SKRTSXN FNTNK I          TT SSTNBL JNRXN         1        17752     0001876068        0001876068  dte electric securitization funding i llc         5535           64331\n",
+       "480747      4.620005           0.960922  __splink__input_table_0  __splink__input_table_1        17752         5522  dte electric securitization funding i                     dte electric                            0                    0.000012                    0.000037                  0.986045                         1.000000          one energy plz          one energy plz                     2             0.000330             0.000330        9450.378101                  0.035236      mi      mi            1    0.015147    0.015147  15.873789         3.374867    detroit    detroit           2   0.001162   0.001162  108.031428        5.450115  TT ELKTRK SKRTSXN FNTNK I                TT ELKTRK         0        17752     0001876068        0001876068  dte electric securitization funding i llc         5522            5109\n",
+       "464506      6.019599           0.984820  __splink__input_table_0  __splink__input_table_1        14051        10935                   constellation energy      luminace solar rhode island                            0                    0.000024                    0.000024                  0.986045                         1.000000              1310 pt st              1310 pt st                     2             0.000024             0.000024        9450.378101                  0.475683      md      md            1    0.025130    0.025130  15.873789         2.034167  baltimore  baltimore           2   0.003583   0.003583  108.031428        1.767102               KNSTLXN ENRJ       LMNS SLR RHT ISLNT         1        14051     0001868275        0001868275                  constellation energy corp        10935           62679\n",
+       "340973      6.201744           0.986596  __splink__input_table_0  __splink__input_table_1        14051         4420                   constellation energy          constellation newenergy                            1                    0.000024                    0.000024               5704.210475                         1.000000              1310 pt st   100 constellation way                     0             0.000024             0.000183           0.881657                  1.000000      md      md            1    0.025130    0.025130  15.873789         2.034167  baltimore  baltimore           2   0.003583   0.003583  108.031428        1.767102               KNSTLXN ENRJ            KNSTLXN NWNRJ         0        14051     0001868275        0001868275                  constellation energy corp         4420           58491\n",
+       "...              ...                ...                      ...                      ...          ...          ...                                    ...                              ...                          ...                         ...                         ...                       ...                              ...                     ...                     ...                   ...                  ...                  ...                ...                       ...     ...     ...          ...         ...         ...        ...              ...        ...        ...         ...        ...        ...         ...             ...                        ...                      ...       ...          ...            ...               ...                                        ...          ...             ...\n",
+       "464642      5.308053           0.975380  __splink__input_table_0  __splink__input_table_1         1585         6561        air products and chemicals /de/          exelon gen extexlaporte                            0                    0.000024                    0.000012                  0.986045                         1.000000      7201 hamilton blvd      7201 hamilton blvd                     2             0.000122             0.000122        9450.378101                  0.095137      pa      pa            1    0.029409    0.029409  15.873789         1.738226  allentown  allentown           2   0.001003   0.001003  108.031428        6.314158      AR PRTKTS ANT XMKLS T      EKSLN JN EKSTKSLPRT         1         1585     0000002969        0000002969          air products & chemicals inc /de/         6561            6081\n",
+       "227094     20.402617           0.999999  __splink__input_table_0  __splink__input_table_1         1586          430             air products and chemicals       air products and chemicals                            2                    0.000037                    0.000037             652179.111493                         0.021160  1940 air products blvd  1940 air products blvd                     2             0.000049             0.000049        9450.378101                  0.237842      pa      pa            1    0.029409    0.029409  15.873789         1.738226  allentown  allentown           2   0.001003   0.001003  108.031428        6.314158        AR PRTKTS ANT XMKLS      AR PRTKTS ANT XMKLS         0         1586     0000002969        0000002969             air products & chemicals, inc.          430             991\n",
+       "224504      5.308053           0.975380  __splink__input_table_0  __splink__input_table_1         1585          435        air products and chemicals /de/                     air products                            0                    0.000024                    0.000037                  0.986045                         1.000000      7201 hamilton blvd      7201 hamilton blvd                     2             0.000122             0.000122        9450.378101                  0.095137      pa      pa            1    0.029409    0.029409  15.873789         1.738226  allentown  allentown           2   0.001003   0.001003  108.031428        6.314158      AR PRTKTS ANT XMKLS T                AR PRTKTS         0         1585     0000002969        0000002969          air products & chemicals inc /de/          435             980\n",
+       "225982      5.308053           0.975380  __splink__input_table_0  __splink__input_table_1         1585          432        air products and chemicals /de/  air products energy enterprises                            0                    0.000024                    0.000012                  0.986045                         1.000000      7201 hamilton blvd      7201 hamilton blvd                     2             0.000122             0.000122        9450.378101                  0.095137      pa      pa            1    0.029409    0.029409  15.873789         1.738226  allentown  allentown           2   0.001003   0.001003  108.031428        6.314158      AR PRTKTS ANT XMKLS T  AR PRTKTS ENRJ ENTRPRSS         0         1585     0000002969        0000002969          air products & chemicals inc /de/          432             353\n",
+       "224473     20.054878           0.999999  __splink__input_table_0  __splink__input_table_1         1348          376                aetna life and casualty          aetna life and casualty                            2                    0.000024                    0.000024             652179.111493                         0.031739      151 farmington ave      151 farmington ave                     2             0.000110             0.000110        9450.378101                  0.105707      ct      ct            1    0.020876    0.020876  15.873789         2.448667   hartford   hartford           2   0.001198   0.001198  108.031428        5.283275            ETN LF ANT KSLT          ETN LF ANT KSLT         0         1348     0000002648        0000002648                   aetna life & casualty co          376             211\n",
+       "\n",
+       "[2085 rows x 43 columns]"
+      ]
+     },
+     "execution_count": 139,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -3394,7 +3019,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 89,
+   "execution_count": 140,
    "id": "11190456-12a9-49df-b863-7a6f674e39eb",
    "metadata": {},
    "outputs": [],
@@ -3404,7 +3029,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 90,
+   "execution_count": 141,
    "id": "5a57bae5-6188-4a6c-be9a-14a159f82d81",
    "metadata": {},
    "outputs": [],
@@ -3414,7 +3039,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 438,
+   "execution_count": 142,
    "id": "461725d0-28c8-43dd-bcd5-086b7d3f7d4b",
    "metadata": {},
    "outputs": [],
@@ -3429,7 +3054,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 439,
+   "execution_count": 143,
    "id": "4d45f339-7a5b-466a-81f5-c71e425a77df",
    "metadata": {},
    "outputs": [],
@@ -3439,7 +3064,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 440,
+   "execution_count": 144,
    "id": "182732ea-39c5-4fb4-9429-5f7aed781ef5",
    "metadata": {},
    "outputs": [],
@@ -3452,7 +3077,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 441,
+   "execution_count": 145,
    "id": "bc058dbf-6f71-4978-90ce-5405edbbf9e5",
    "metadata": {},
    "outputs": [
@@ -3498,10 +3123,10 @@
        "      <td>alabama power co</td>\n",
        "      <td>NaN</td>\n",
        "      <td>1</td>\n",
-       "      <td>2575</td>\n",
-       "      <td>80977</td>\n",
-       "      <td>0.999018</td>\n",
-       "      <td>2</td>\n",
+       "      <td>1701.0</td>\n",
+       "      <td>478.0</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>2.0</td>\n",
        "      <td>both</td>\n",
        "      <td>1.0</td>\n",
        "    </tr>\n",
@@ -3512,10 +3137,10 @@
        "      <td>fluence energy, inc.</td>\n",
        "      <td>Fluence</td>\n",
        "      <td>0</td>\n",
-       "      <td>126809</td>\n",
-       "      <td>21615</td>\n",
-       "      <td>0.000002</td>\n",
-       "      <td>0</td>\n",
+       "      <td>21792.0</td>\n",
+       "      <td>6889.0</td>\n",
+       "      <td>0.016529</td>\n",
+       "      <td>0.0</td>\n",
        "      <td>both</td>\n",
        "      <td>0.0</td>\n",
        "    </tr>\n",
@@ -3526,12 +3151,12 @@
        "      <td>georgia power co</td>\n",
        "      <td>NaN</td>\n",
        "      <td>1</td>\n",
-       "      <td>50428</td>\n",
-       "      <td>68242</td>\n",
-       "      <td>0.029853</td>\n",
-       "      <td>2</td>\n",
+       "      <td>23416.0</td>\n",
+       "      <td>7653.0</td>\n",
+       "      <td>0.999997</td>\n",
+       "      <td>2.0</td>\n",
        "      <td>both</td>\n",
-       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
@@ -3540,10 +3165,10 @@
        "      <td>columbus southern power co /oh/</td>\n",
        "      <td>Columbus Southern Power Co</td>\n",
        "      <td>1</td>\n",
-       "      <td>129635</td>\n",
-       "      <td>96300</td>\n",
-       "      <td>0.997628</td>\n",
-       "      <td>1</td>\n",
+       "      <td>13310.0</td>\n",
+       "      <td>4281.0</td>\n",
+       "      <td>0.999981</td>\n",
+       "      <td>1.0</td>\n",
        "      <td>both</td>\n",
        "      <td>1.0</td>\n",
        "    </tr>\n",
@@ -3554,10 +3179,10 @@
        "      <td>duke energy corp</td>\n",
        "      <td>NaN</td>\n",
        "      <td>1</td>\n",
-       "      <td>37661</td>\n",
-       "      <td>71555</td>\n",
-       "      <td>0.926352</td>\n",
-       "      <td>2</td>\n",
+       "      <td>17793.0</td>\n",
+       "      <td>5564.0</td>\n",
+       "      <td>0.927294</td>\n",
+       "      <td>2.0</td>\n",
        "      <td>both</td>\n",
        "      <td>0.0</td>\n",
        "    </tr>\n",
@@ -3568,10 +3193,10 @@
        "      <td>duke energy carolinas, llc</td>\n",
        "      <td>Duke Energy Carolinas LLC</td>\n",
        "      <td>1</td>\n",
-       "      <td>133261</td>\n",
-       "      <td>118543</td>\n",
-       "      <td>0.987916</td>\n",
-       "      <td>2</td>\n",
+       "      <td>17790.0</td>\n",
+       "      <td>5558.0</td>\n",
+       "      <td>0.999987</td>\n",
+       "      <td>2.0</td>\n",
        "      <td>both</td>\n",
        "      <td>1.0</td>\n",
        "    </tr>\n",
@@ -3582,10 +3207,10 @@
        "      <td>berkshire realty co inc /de</td>\n",
        "      <td>Berkshire Wind Power Cooperative Corp</td>\n",
        "      <td>0</td>\n",
-       "      <td>198821</td>\n",
-       "      <td>89415</td>\n",
-       "      <td>0.000030</td>\n",
-       "      <td>0</td>\n",
+       "      <td>7449.0</td>\n",
+       "      <td>1712.0</td>\n",
+       "      <td>0.001912</td>\n",
+       "      <td>0.0</td>\n",
        "      <td>both</td>\n",
        "      <td>0.0</td>\n",
        "    </tr>\n",
@@ -3596,10 +3221,10 @@
        "      <td>southern co</td>\n",
        "      <td>southern co services inc</td>\n",
        "      <td>0</td>\n",
-       "      <td>50417</td>\n",
-       "      <td>111824</td>\n",
-       "      <td>0.000063</td>\n",
-       "      <td>0</td>\n",
+       "      <td>50962.0</td>\n",
+       "      <td>17068.0</td>\n",
+       "      <td>0.007216</td>\n",
+       "      <td>0.0</td>\n",
        "      <td>both</td>\n",
        "      <td>0.0</td>\n",
        "    </tr>\n",
@@ -3610,10 +3235,10 @@
        "      <td>southern co</td>\n",
        "      <td>Southern Power Co</td>\n",
        "      <td>0</td>\n",
-       "      <td>50417</td>\n",
-       "      <td>49613</td>\n",
-       "      <td>0.004315</td>\n",
-       "      <td>0</td>\n",
+       "      <td>50963.0</td>\n",
+       "      <td>17089.0</td>\n",
+       "      <td>0.034232</td>\n",
+       "      <td>0.0</td>\n",
        "      <td>both</td>\n",
        "      <td>0.0</td>\n",
        "    </tr>\n",
@@ -3624,12 +3249,12 @@
        "      <td>pacific gas &amp; electric co</td>\n",
        "      <td>NaN</td>\n",
        "      <td>1</td>\n",
-       "      <td>2898</td>\n",
-       "      <td>55480</td>\n",
-       "      <td>0.624991</td>\n",
-       "      <td>2</td>\n",
+       "      <td>41598.0</td>\n",
+       "      <td>13933.0</td>\n",
+       "      <td>0.999948</td>\n",
+       "      <td>2.0</td>\n",
        "      <td>both</td>\n",
-       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>10</th>\n",
@@ -3638,10 +3263,10 @@
        "      <td>firstenergy corp</td>\n",
        "      <td>FirstEnergy</td>\n",
        "      <td>0</td>\n",
-       "      <td>14192</td>\n",
-       "      <td>69716</td>\n",
-       "      <td>0.999707</td>\n",
-       "      <td>2</td>\n",
+       "      <td>21579.0</td>\n",
+       "      <td>6776.0</td>\n",
+       "      <td>0.999998</td>\n",
+       "      <td>2.0</td>\n",
        "      <td>both</td>\n",
        "      <td>1.0</td>\n",
        "    </tr>\n",
@@ -3652,12 +3277,12 @@
        "      <td>firstenergy corp</td>\n",
        "      <td>FirstEnergy Nuclear Generation Corp</td>\n",
        "      <td>0</td>\n",
-       "      <td>14192</td>\n",
-       "      <td>102163</td>\n",
-       "      <td>0.000066</td>\n",
-       "      <td>0</td>\n",
-       "      <td>both</td>\n",
+       "      <td>21579.0</td>\n",
+       "      <td>6780.0</td>\n",
+       "      <td>0.986543</td>\n",
        "      <td>0.0</td>\n",
+       "      <td>both</td>\n",
+       "      <td>1.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>12</th>\n",
@@ -3666,10 +3291,10 @@
        "      <td>firstenergy corp</td>\n",
        "      <td>First Energy Services</td>\n",
        "      <td>0</td>\n",
-       "      <td>14192</td>\n",
-       "      <td>162033</td>\n",
-       "      <td>0.000066</td>\n",
-       "      <td>0</td>\n",
+       "      <td>21579.0</td>\n",
+       "      <td>6763.0</td>\n",
+       "      <td>0.085467</td>\n",
+       "      <td>0.0</td>\n",
        "      <td>both</td>\n",
        "      <td>0.0</td>\n",
        "    </tr>\n",
@@ -3680,11 +3305,11 @@
        "      <td>firstenergy corp</td>\n",
        "      <td>First Energy Corp</td>\n",
        "      <td>1</td>\n",
-       "      <td>14192</td>\n",
-       "      <td>121855</td>\n",
-       "      <td>0.010697</td>\n",
-       "      <td>1</td>\n",
-       "      <td>both</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>left_only</td>\n",
        "      <td>0.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -3694,10 +3319,10 @@
        "      <td>tucson electric power co</td>\n",
        "      <td>NaN</td>\n",
        "      <td>1</td>\n",
-       "      <td>715</td>\n",
-       "      <td>41507</td>\n",
-       "      <td>0.999798</td>\n",
-       "      <td>2</td>\n",
+       "      <td>55725.0</td>\n",
+       "      <td>18901.0</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>2.0</td>\n",
        "      <td>both</td>\n",
        "      <td>1.0</td>\n",
        "    </tr>\n",
@@ -3708,10 +3333,10 @@
        "      <td>tampa electric co</td>\n",
        "      <td>NaN</td>\n",
        "      <td>1</td>\n",
-       "      <td>231716</td>\n",
-       "      <td>47982</td>\n",
-       "      <td>0.989228</td>\n",
-       "      <td>2</td>\n",
+       "      <td>53604.0</td>\n",
+       "      <td>18180.0</td>\n",
+       "      <td>0.991059</td>\n",
+       "      <td>2.0</td>\n",
        "      <td>both</td>\n",
        "      <td>1.0</td>\n",
        "    </tr>\n",
@@ -3722,10 +3347,10 @@
        "      <td>dominion energy, inc</td>\n",
        "      <td>NaN</td>\n",
        "      <td>1</td>\n",
-       "      <td>15937</td>\n",
-       "      <td>71878</td>\n",
-       "      <td>0.998282</td>\n",
-       "      <td>2</td>\n",
+       "      <td>17484.0</td>\n",
+       "      <td>5386.0</td>\n",
+       "      <td>0.999985</td>\n",
+       "      <td>2.0</td>\n",
        "      <td>both</td>\n",
        "      <td>1.0</td>\n",
        "    </tr>\n",
@@ -3736,10 +3361,10 @@
        "      <td>nrg energy, inc</td>\n",
        "      <td>NRG Energy Gas &amp; Wind Holdings Inc</td>\n",
        "      <td>0</td>\n",
-       "      <td>7168</td>\n",
-       "      <td>17454</td>\n",
-       "      <td>0.002575</td>\n",
-       "      <td>0</td>\n",
+       "      <td>40084.0</td>\n",
+       "      <td>13240.0</td>\n",
+       "      <td>0.300167</td>\n",
+       "      <td>0.0</td>\n",
        "      <td>both</td>\n",
        "      <td>0.0</td>\n",
        "    </tr>\n",
@@ -3750,10 +3375,10 @@
        "      <td>nrg energy inc</td>\n",
        "      <td>NRG Energy Inc</td>\n",
        "      <td>1</td>\n",
-       "      <td>7173</td>\n",
-       "      <td>95029</td>\n",
-       "      <td>0.988801</td>\n",
-       "      <td>2</td>\n",
+       "      <td>40084.0</td>\n",
+       "      <td>13243.0</td>\n",
+       "      <td>0.999820</td>\n",
+       "      <td>2.0</td>\n",
        "      <td>both</td>\n",
        "      <td>1.0</td>\n",
        "    </tr>\n",
@@ -3764,10 +3389,10 @@
        "      <td>oglethorpe power corp</td>\n",
        "      <td>NaN</td>\n",
        "      <td>1</td>\n",
-       "      <td>172902</td>\n",
-       "      <td>56478</td>\n",
-       "      <td>0.999768</td>\n",
-       "      <td>2</td>\n",
+       "      <td>40576.0</td>\n",
+       "      <td>13515.0</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>2.0</td>\n",
        "      <td>both</td>\n",
        "      <td>1.0</td>\n",
        "    </tr>\n",
@@ -3778,43 +3403,88 @@
        "      <td>central maine power co</td>\n",
        "      <td>NaN</td>\n",
        "      <td>1</td>\n",
-       "      <td>126771</td>\n",
-       "      <td>176663</td>\n",
-       "      <td>0.897700</td>\n",
-       "      <td>2</td>\n",
+       "      <td>10876.0</td>\n",
+       "      <td>3424.0</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>both</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>21</th>\n",
+       "      <td>0001032208</td>\n",
+       "      <td>61296</td>\n",
+       "      <td>sempra energy</td>\n",
+       "      <td>Sempra Generation</td>\n",
+       "      <td>1</td>\n",
+       "      <td>49303.0</td>\n",
+       "      <td>16270.0</td>\n",
+       "      <td>0.559074</td>\n",
+       "      <td>0.0</td>\n",
        "      <td>both</td>\n",
        "      <td>0.0</td>\n",
        "    </tr>\n",
+       "    <tr>\n",
+       "      <th>22</th>\n",
+       "      <td>0000004904</td>\n",
+       "      <td>488</td>\n",
+       "      <td>american electric power co inc</td>\n",
+       "      <td>American Electric Power Inc</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2926.0</td>\n",
+       "      <td>793.0</td>\n",
+       "      <td>0.996076</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>both</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>23</th>\n",
+       "      <td>0000715957</td>\n",
+       "      <td>5248</td>\n",
+       "      <td>dominion energy, inc</td>\n",
+       "      <td>Dominion Energy Inc.</td>\n",
+       "      <td>1</td>\n",
+       "      <td>17484.0</td>\n",
+       "      <td>5386.0</td>\n",
+       "      <td>0.999985</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>both</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "   central_index_key  utility_id_eia                 sec_company_name                       eia_company_name  match  record_id_l  record_id_r  match_probability  gamma_company_name_no_legal _merge  predicted_match\n",
-       "0         0000003153             195                 alabama power co                                    NaN      1         2575        80977           0.999018                            2   both              1.0\n",
-       "1         0001868941           58702             fluence energy, inc.                                Fluence      0       126809        21615           0.000002                            0   both              0.0\n",
-       "2         0000041091            7140                 georgia power co                                    NaN      1        50428        68242           0.029853                            2   both              0.0\n",
-       "3         0000022198            4062  columbus southern power co /oh/             Columbus Southern Power Co      1       129635        96300           0.997628                            1   both              1.0\n",
-       "4         0001326160            5416                 duke energy corp                                    NaN      1        37661        71555           0.926352                            2   both              0.0\n",
-       "5         0000030371           54905       duke energy carolinas, llc              Duke Energy Carolinas LLC      1       133261       118543           0.987916                            2   both              1.0\n",
-       "6         0000869446           57140      berkshire realty co inc /de  Berkshire Wind Power Cooperative Corp      0       198821        89415           0.000030                            0   both              0.0\n",
-       "7         0000092122           18195                      southern co               southern co services inc      0        50417       111824           0.000063                            0   both              0.0\n",
-       "8         0000092122           17650                      southern co                      Southern Power Co      0        50417        49613           0.004315                            0   both              0.0\n",
-       "9         0000075488           14328        pacific gas & electric co                                    NaN      1         2898        55480           0.624991                            2   both              0.0\n",
-       "10        0001031296            6526                 firstenergy corp                            FirstEnergy      0        14192        69716           0.999707                            2   both              1.0\n",
-       "11        0001031296           54776                 firstenergy corp    FirstEnergy Nuclear Generation Corp      0        14192       102163           0.000066                            0   both              0.0\n",
-       "12        0001031296            6458                 firstenergy corp                  First Energy Services      0        14192       162033           0.000066                            0   both              0.0\n",
-       "13        0001031296           32208                 firstenergy corp                      First Energy Corp      1        14192       121855           0.010697                            1   both              0.0\n",
-       "14        0000100122           24211         tucson electric power co                                    NaN      1          715        41507           0.999798                            2   both              1.0\n",
-       "15        0000096271           18454                tampa electric co                                    NaN      1       231716        47982           0.989228                            2   both              1.0\n",
-       "16        0000715957            5248             dominion energy, inc                                    NaN      1        15937        71878           0.998282                            2   both              1.0\n",
-       "17        0001013871           59883                  nrg energy, inc     NRG Energy Gas & Wind Holdings Inc      0         7168        17454           0.002575                            0   both              0.0\n",
-       "18        0001013871           13377                   nrg energy inc                         NRG Energy Inc      1         7173        95029           0.988801                            2   both              1.0\n",
-       "19        0000788816           13994            oglethorpe power corp                                    NaN      1       172902        56478           0.999768                            2   both              1.0\n",
-       "20        0000018675            3266           central maine power co                                    NaN      1       126771       176663           0.897700                            2   both              0.0"
+       "   central_index_key  utility_id_eia                 sec_company_name                       eia_company_name  match  record_id_l  record_id_r  match_probability  gamma_company_name_no_legal     _merge  predicted_match\n",
+       "0         0000003153             195                 alabama power co                                    NaN      1       1701.0        478.0           1.000000                          2.0       both              1.0\n",
+       "1         0001868941           58702             fluence energy, inc.                                Fluence      0      21792.0       6889.0           0.016529                          0.0       both              0.0\n",
+       "2         0000041091            7140                 georgia power co                                    NaN      1      23416.0       7653.0           0.999997                          2.0       both              1.0\n",
+       "3         0000022198            4062  columbus southern power co /oh/             Columbus Southern Power Co      1      13310.0       4281.0           0.999981                          1.0       both              1.0\n",
+       "4         0001326160            5416                 duke energy corp                                    NaN      1      17793.0       5564.0           0.927294                          2.0       both              0.0\n",
+       "5         0000030371           54905       duke energy carolinas, llc              Duke Energy Carolinas LLC      1      17790.0       5558.0           0.999987                          2.0       both              1.0\n",
+       "6         0000869446           57140      berkshire realty co inc /de  Berkshire Wind Power Cooperative Corp      0       7449.0       1712.0           0.001912                          0.0       both              0.0\n",
+       "7         0000092122           18195                      southern co               southern co services inc      0      50962.0      17068.0           0.007216                          0.0       both              0.0\n",
+       "8         0000092122           17650                      southern co                      Southern Power Co      0      50963.0      17089.0           0.034232                          0.0       both              0.0\n",
+       "9         0000075488           14328        pacific gas & electric co                                    NaN      1      41598.0      13933.0           0.999948                          2.0       both              1.0\n",
+       "10        0001031296            6526                 firstenergy corp                            FirstEnergy      0      21579.0       6776.0           0.999998                          2.0       both              1.0\n",
+       "11        0001031296           54776                 firstenergy corp    FirstEnergy Nuclear Generation Corp      0      21579.0       6780.0           0.986543                          0.0       both              1.0\n",
+       "12        0001031296            6458                 firstenergy corp                  First Energy Services      0      21579.0       6763.0           0.085467                          0.0       both              0.0\n",
+       "13        0001031296           32208                 firstenergy corp                      First Energy Corp      1          NaN          NaN                NaN                          NaN  left_only              0.0\n",
+       "14        0000100122           24211         tucson electric power co                                    NaN      1      55725.0      18901.0           1.000000                          2.0       both              1.0\n",
+       "15        0000096271           18454                tampa electric co                                    NaN      1      53604.0      18180.0           0.991059                          2.0       both              1.0\n",
+       "16        0000715957            5248             dominion energy, inc                                    NaN      1      17484.0       5386.0           0.999985                          2.0       both              1.0\n",
+       "17        0001013871           59883                  nrg energy, inc     NRG Energy Gas & Wind Holdings Inc      0      40084.0      13240.0           0.300167                          0.0       both              0.0\n",
+       "18        0001013871           13377                   nrg energy inc                         NRG Energy Inc      1      40084.0      13243.0           0.999820                          2.0       both              1.0\n",
+       "19        0000788816           13994            oglethorpe power corp                                    NaN      1      40576.0      13515.0           1.000000                          2.0       both              1.0\n",
+       "20        0000018675            3266           central maine power co                                    NaN      1      10876.0       3424.0           1.000000                          2.0       both              1.0\n",
+       "21        0001032208           61296                    sempra energy                      Sempra Generation      1      49303.0      16270.0           0.559074                          0.0       both              0.0\n",
+       "22        0000004904             488   american electric power co inc            American Electric Power Inc      1       2926.0        793.0           0.996076                          2.0       both              1.0\n",
+       "23        0000715957            5248             dominion energy, inc                   Dominion Energy Inc.      1      17484.0       5386.0           0.999985                          2.0       both              1.0"
       ]
      },
-     "execution_count": 441,
+     "execution_count": 145,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -3825,7 +3495,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 442,
+   "execution_count": 146,
    "id": "2fc6ed05-5b35-4856-a668-f16e253e7fea",
    "metadata": {},
    "outputs": [],
@@ -3841,19 +3511,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 443,
+   "execution_count": 147,
    "id": "99a7bc64-9f32-4f53-9a89-63a31ff7debe",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "(np.float64(0.8888888888888888),\n",
-       " np.float64(0.6153846153846154),\n",
-       " 0.7142857142857143)"
+       "(np.float64(0.8666666666666667), np.float64(0.8125), 0.7916666666666666)"
       ]
      },
-     "execution_count": 443,
+     "execution_count": 147,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -3864,7 +3532,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 444,
+   "execution_count": 148,
    "id": "08932be5-b90c-440d-9efb-156cb4d63c93",
    "metadata": {},
    "outputs": [
@@ -3896,13 +3564,13 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>Negative</th>\n",
-       "      <td>7</td>\n",
-       "      <td>1</td>\n",
+       "      <td>6</td>\n",
+       "      <td>2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>Positive</th>\n",
-       "      <td>5</td>\n",
-       "      <td>8</td>\n",
+       "      <td>3</td>\n",
+       "      <td>13</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
@@ -3910,11 +3578,11 @@
       ],
       "text/plain": [
        "          Predicted Negative  Predicted Positive\n",
-       "Negative                   7                   1\n",
-       "Positive                   5                   8"
+       "Negative                   6                   2\n",
+       "Positive                   3                  13"
       ]
      },
-     "execution_count": 444,
+     "execution_count": 148,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -3929,7 +3597,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 445,
+   "execution_count": 149,
    "id": "025c80e9-5055-4eaa-a873-38b910cd7f94",
    "metadata": {},
    "outputs": [],
@@ -3939,7 +3607,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 446,
+   "execution_count": 150,
    "id": "e4f44fda-1b55-4f9c-a96d-5eec36dac768",
    "metadata": {},
    "outputs": [
@@ -3979,44 +3647,16 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>0000041091</td>\n",
-       "      <td>7140</td>\n",
-       "      <td>georgia power co</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>1</td>\n",
-       "      <td>50428</td>\n",
-       "      <td>68242</td>\n",
-       "      <td>0.029853</td>\n",
-       "      <td>2</td>\n",
-       "      <td>both</td>\n",
-       "      <td>0.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
        "      <th>4</th>\n",
        "      <td>0001326160</td>\n",
        "      <td>5416</td>\n",
        "      <td>duke energy corp</td>\n",
        "      <td>NaN</td>\n",
        "      <td>1</td>\n",
-       "      <td>37661</td>\n",
-       "      <td>71555</td>\n",
-       "      <td>0.926352</td>\n",
-       "      <td>2</td>\n",
-       "      <td>both</td>\n",
-       "      <td>0.0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>9</th>\n",
-       "      <td>0000075488</td>\n",
-       "      <td>14328</td>\n",
-       "      <td>pacific gas &amp; electric co</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>1</td>\n",
-       "      <td>2898</td>\n",
-       "      <td>55480</td>\n",
-       "      <td>0.624991</td>\n",
-       "      <td>2</td>\n",
+       "      <td>17793.0</td>\n",
+       "      <td>5564.0</td>\n",
+       "      <td>0.927294</td>\n",
+       "      <td>2.0</td>\n",
        "      <td>both</td>\n",
        "      <td>0.0</td>\n",
        "    </tr>\n",
@@ -4027,10 +3667,24 @@
        "      <td>firstenergy corp</td>\n",
        "      <td>FirstEnergy</td>\n",
        "      <td>0</td>\n",
-       "      <td>14192</td>\n",
-       "      <td>69716</td>\n",
-       "      <td>0.999707</td>\n",
-       "      <td>2</td>\n",
+       "      <td>21579.0</td>\n",
+       "      <td>6776.0</td>\n",
+       "      <td>0.999998</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>both</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>0001031296</td>\n",
+       "      <td>54776</td>\n",
+       "      <td>firstenergy corp</td>\n",
+       "      <td>FirstEnergy Nuclear Generation Corp</td>\n",
+       "      <td>0</td>\n",
+       "      <td>21579.0</td>\n",
+       "      <td>6780.0</td>\n",
+       "      <td>0.986543</td>\n",
+       "      <td>0.0</td>\n",
        "      <td>both</td>\n",
        "      <td>1.0</td>\n",
        "    </tr>\n",
@@ -4041,24 +3695,24 @@
        "      <td>firstenergy corp</td>\n",
        "      <td>First Energy Corp</td>\n",
        "      <td>1</td>\n",
-       "      <td>14192</td>\n",
-       "      <td>121855</td>\n",
-       "      <td>0.010697</td>\n",
-       "      <td>1</td>\n",
-       "      <td>both</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>left_only</td>\n",
        "      <td>0.0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>20</th>\n",
-       "      <td>0000018675</td>\n",
-       "      <td>3266</td>\n",
-       "      <td>central maine power co</td>\n",
-       "      <td>NaN</td>\n",
+       "      <th>21</th>\n",
+       "      <td>0001032208</td>\n",
+       "      <td>61296</td>\n",
+       "      <td>sempra energy</td>\n",
+       "      <td>Sempra Generation</td>\n",
        "      <td>1</td>\n",
-       "      <td>126771</td>\n",
-       "      <td>176663</td>\n",
-       "      <td>0.897700</td>\n",
-       "      <td>2</td>\n",
+       "      <td>49303.0</td>\n",
+       "      <td>16270.0</td>\n",
+       "      <td>0.559074</td>\n",
+       "      <td>0.0</td>\n",
        "      <td>both</td>\n",
        "      <td>0.0</td>\n",
        "    </tr>\n",
@@ -4067,16 +3721,15 @@
        "</div>"
       ],
       "text/plain": [
-       "   central_index_key  utility_id_eia           sec_company_name   eia_company_name  match  record_id_l  record_id_r  match_probability  gamma_company_name_no_legal _merge  predicted_match\n",
-       "2         0000041091            7140           georgia power co                NaN      1        50428        68242           0.029853                            2   both              0.0\n",
-       "4         0001326160            5416           duke energy corp                NaN      1        37661        71555           0.926352                            2   both              0.0\n",
-       "9         0000075488           14328  pacific gas & electric co                NaN      1         2898        55480           0.624991                            2   both              0.0\n",
-       "10        0001031296            6526           firstenergy corp        FirstEnergy      0        14192        69716           0.999707                            2   both              1.0\n",
-       "13        0001031296           32208           firstenergy corp  First Energy Corp      1        14192       121855           0.010697                            1   both              0.0\n",
-       "20        0000018675            3266     central maine power co                NaN      1       126771       176663           0.897700                            2   both              0.0"
+       "   central_index_key  utility_id_eia  sec_company_name                     eia_company_name  match  record_id_l  record_id_r  match_probability  gamma_company_name_no_legal     _merge  predicted_match\n",
+       "4         0001326160            5416  duke energy corp                                  NaN      1      17793.0       5564.0           0.927294                          2.0       both              0.0\n",
+       "10        0001031296            6526  firstenergy corp                          FirstEnergy      0      21579.0       6776.0           0.999998                          2.0       both              1.0\n",
+       "11        0001031296           54776  firstenergy corp  FirstEnergy Nuclear Generation Corp      0      21579.0       6780.0           0.986543                          0.0       both              1.0\n",
+       "13        0001031296           32208  firstenergy corp                    First Energy Corp      1          NaN          NaN                NaN                          NaN  left_only              0.0\n",
+       "21        0001032208           61296     sempra energy                    Sempra Generation      1      49303.0      16270.0           0.559074                          0.0       both              0.0"
       ]
      },
-     "execution_count": 446,
+     "execution_count": 150,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -4087,7 +3740,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 447,
+   "execution_count": 151,
    "id": "c425a676-aa6e-4d8f-b814-931da392c2ff",
    "metadata": {},
    "outputs": [],
@@ -4105,7 +3758,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 448,
+   "execution_count": 152,
    "id": "ff55f2cb-7ce1-4697-99e7-bf22918f7ed1",
    "metadata": {},
    "outputs": [
@@ -4114,23 +3767,23 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-e70cd055b0c84ec9b321e88181d19e2b.vega-embed {\n",
+       "  #altair-viz-fe735af639b44651936e0c828a9addec.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-e70cd055b0c84ec9b321e88181d19e2b.vega-embed details,\n",
-       "  #altair-viz-e70cd055b0c84ec9b321e88181d19e2b.vega-embed details summary {\n",
+       "  #altair-viz-fe735af639b44651936e0c828a9addec.vega-embed details,\n",
+       "  #altair-viz-fe735af639b44651936e0c828a9addec.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-e70cd055b0c84ec9b321e88181d19e2b\"></div>\n",
+       "<div id=\"altair-viz-fe735af639b44651936e0c828a9addec\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-e70cd055b0c84ec9b321e88181d19e2b\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-e70cd055b0c84ec9b321e88181d19e2b\");\n",
+       "    if (outputDiv.id !== \"altair-viz-fe735af639b44651936e0c828a9addec\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-fe735af639b44651936e0c828a9addec\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -4176,14 +3829,14 @@
        "        .catch(showError)\n",
        "        .then(() => displayChart(vegaEmbed));\n",
        "    }\n",
-       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"layer\": [{\"layer\": [{\"mark\": \"rule\", \"encoding\": {\"color\": {\"value\": \"black\"}, \"size\": {\"value\": 0.5}, \"y\": {\"field\": \"zero\", \"type\": \"quantitative\"}}}, {\"mark\": {\"type\": \"bar\", \"width\": 60}, \"encoding\": {\"color\": {\"condition\": {\"test\": \"(datum.log2_bayes_factor < 0)\", \"value\": \"red\"}, \"value\": \"green\"}, \"opacity\": {\"condition\": {\"test\": \"datum.column_name == 'Prior match weight' || datum.column_name == 'Final score'\", \"value\": 1}, \"value\": 0.5}, \"tooltip\": [{\"field\": \"column_name\", \"title\": \"Comparison column\", \"type\": \"nominal\"}, {\"field\": \"value_l\", \"title\": \"Value (L)\", \"type\": \"nominal\"}, {\"field\": \"value_r\", \"title\": \"Value (R)\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"comparison_vector_value\", \"title\": \"Comparison vector value\", \"type\": \"nominal\"}, {\"field\": \"bayes_factor\", \"format\": \",.4f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \",.4f\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"prob\", \"format\": \".4f\", \"title\": \"Cumulative match probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"grid\": true, \"labelAlign\": \"center\", \"labelAngle\": -20, \"labelExpr\": \"datum.value == 'Prior' || datum.value == 'Final score' ? 
'' : datum.value\", \"labelPadding\": 10, \"tickBand\": \"extent\", \"title\": \"Column\"}, \"field\": \"column_name\", \"sort\": {\"field\": \"bar_sort_order\", \"order\": \"ascending\"}, \"type\": \"nominal\"}, \"y\": {\"axis\": {\"grid\": false, \"orient\": \"left\", \"title\": \"Match Weight\"}, \"field\": \"previous_sum\", \"type\": \"quantitative\"}, \"y2\": {\"field\": \"sum\"}}}, {\"mark\": {\"type\": \"text\", \"fontWeight\": \"bold\"}, \"encoding\": {\"color\": {\"value\": \"white\"}, \"text\": {\"condition\": {\"test\": \"abs(datum.log2_bayes_factor) > 1\", \"field\": \"log2_bayes_factor\", \"format\": \".2f\", \"type\": \"nominal\"}, \"value\": \"\"}, \"x\": {\"axis\": {\"labelAngle\": -20, \"title\": \"Column\"}, \"field\": \"column_name\", \"sort\": {\"field\": \"bar_sort_order\", \"order\": \"ascending\"}, \"type\": \"nominal\"}, \"y\": {\"axis\": {\"orient\": \"left\"}, \"field\": \"center\", \"type\": \"quantitative\"}}}, {\"mark\": {\"type\": \"text\", \"baseline\": \"bottom\", \"dy\": -25, \"fontWeight\": \"bold\"}, \"encoding\": {\"color\": {\"value\": \"black\"}, \"text\": {\"field\": \"column_name\", \"type\": \"nominal\"}, \"x\": {\"axis\": {\"labelAngle\": -20, \"title\": \"Column\"}, \"field\": \"column_name\", \"sort\": {\"field\": \"bar_sort_order\", \"order\": \"ascending\"}, \"type\": \"nominal\"}, \"y\": {\"field\": \"sum_top\", \"type\": \"quantitative\"}}}, {\"mark\": {\"type\": \"text\", \"baseline\": \"bottom\", \"dy\": -13, \"fontSize\": 8}, \"encoding\": {\"color\": {\"value\": \"grey\"}, \"text\": {\"field\": \"value_l\", \"type\": \"nominal\"}, \"x\": {\"axis\": {\"labelAngle\": -20, \"title\": \"Column\"}, \"field\": \"column_name\", \"sort\": {\"field\": \"bar_sort_order\", \"order\": \"ascending\"}, \"type\": \"nominal\"}, \"y\": {\"field\": \"sum_top\", \"type\": \"quantitative\"}}}, {\"mark\": {\"type\": \"text\", \"baseline\": \"bottom\", \"dy\": -5, \"fontSize\": 8}, \"encoding\": {\"color\": {\"value\": \"grey\"}, 
\"text\": {\"field\": \"value_r\", \"type\": \"nominal\"}, \"x\": {\"axis\": {\"labelAngle\": -20, \"title\": \"Column\"}, \"field\": \"column_name\", \"sort\": {\"field\": \"bar_sort_order\", \"order\": \"ascending\"}, \"type\": \"nominal\"}, \"y\": {\"field\": \"sum_top\", \"type\": \"quantitative\"}}}]}, {\"mark\": {\"type\": \"rule\", \"color\": \"black\", \"strokeWidth\": 2, \"x2Offset\": 30, \"xOffset\": -30}, \"encoding\": {\"x\": {\"axis\": {\"labelAngle\": -20, \"title\": \"Column\"}, \"field\": \"column_name\", \"sort\": {\"field\": \"bar_sort_order\", \"order\": \"ascending\"}, \"type\": \"nominal\"}, \"x2\": {\"field\": \"lead\"}, \"y\": {\"axis\": {\"labelExpr\": \"format(1 / (1 + pow(2, -1*datum.value)), '.2r')\", \"orient\": \"right\", \"title\": \"Probability\"}, \"field\": \"sum\", \"scale\": {\"zero\": false}, \"type\": \"quantitative\"}}}], \"data\": {\"name\": \"data-2a99b4d425314fc5f97f7ffbd603dba9\"}, \"height\": 450, \"params\": [{\"name\": \"record_number\", \"bind\": {\"input\": \"range\", \"max\": 5, \"min\": 0, \"step\": 1}, \"value\": 0}], \"resolve\": {\"axis\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Match weights waterfall chart\", \"subtitle\": \"How each comparison contributes to the final match score\"}, \"transform\": [{\"filter\": \"(datum.record_number == record_number)\"}, {\"filter\": \"(datum.bayes_factor !== 1.0)\"}, {\"window\": [{\"op\": \"sum\", \"field\": \"log2_bayes_factor\", \"as\": \"sum\"}, {\"op\": \"lead\", \"field\": \"column_name\", \"as\": \"lead\"}], \"frame\": [null, 0]}, {\"calculate\": \"datum.column_name === \\\"Final score\\\" ? datum.sum - datum.log2_bayes_factor : datum.sum\", \"as\": \"sum\"}, {\"calculate\": \"datum.lead === null ? datum.column_name : datum.lead\", \"as\": \"lead\"}, {\"calculate\": \"datum.column_name === \\\"Final score\\\" || datum.column_name === \\\"Prior match weight\\\" ? 
0 : datum.sum - datum.log2_bayes_factor\", \"as\": \"previous_sum\"}, {\"calculate\": \"datum.sum > datum.previous_sum ? datum.column_name : \\\"\\\"\", \"as\": \"top_label\"}, {\"calculate\": \"datum.sum < datum.previous_sum ? datum.column_name : \\\"\\\"\", \"as\": \"bottom_label\"}, {\"calculate\": \"datum.sum > datum.previous_sum ? datum.sum : datum.previous_sum\", \"as\": \"sum_top\"}, {\"calculate\": \"datum.sum < datum.previous_sum ? datum.sum : datum.previous_sum\", \"as\": \"sum_bottom\"}, {\"calculate\": \"(datum.sum + datum.previous_sum) / 2\", \"as\": \"center\"}, {\"calculate\": \"(datum.log2_bayes_factor > 0 ? \\\"+\\\" : \\\"\\\") + datum.log2_bayes_factor\", \"as\": \"text_log2_bayes_factor\"}, {\"calculate\": \"datum.sum < datum.previous_sum ? 4 : -4\", \"as\": \"dy\"}, {\"calculate\": \"datum.sum < datum.previous_sum ? \\\"top\\\" : \\\"bottom\\\"\", \"as\": \"baseline\"}, {\"calculate\": \"1. / (1 + pow(2, -1.*datum.sum))\", \"as\": \"prob\"}, {\"calculate\": \"0*datum.sum\", \"as\": \"zero\"}], \"width\": {\"step\": 75}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-2a99b4d425314fc5f97f7ffbd603dba9\": [{\"column_name\": \"Prior\", \"label_for_charts\": \"Starting match weight (prior)\", \"sql_condition\": null, \"log2_bayes_factor\": -19.10240998404316, \"bayes_factor\": 1.7766488754200009e-06, \"comparison_vector_value\": null, \"m_probability\": null, \"u_probability\": null, \"bayes_factor_description\": null, \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": null, \"bar_sort_order\": 0, \"record_number\": 0}, {\"sql_condition\": \"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Exact match on company_name_no_legal\", \"m_probability\": 1.0, \"u_probability\": 1.1463345458785815e-06, \"bayes_factor\": 872345.6896551723, \"log2_bayes_factor\": 19.734540428156702, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"If 
comparison level is `exact match on company_name_no_legal` then comparison is 872,346 times more likely to be a match\", \"column_name\": \"company_name_no_legal\", \"value_l\": \"georgia power\", \"value_r\": \"georgia power\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 1, \"record_number\": 0}, {\"sql_condition\": \"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Term freq adjustment on company_name_no_legal with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 0.019854514334617032, \"log2_bayes_factor\": -5.654389118542616, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"Term frequency adjustment on company_name_no_legal makes comparison  50.37 times less likely to be a match\", \"column_name\": \"tf_company_name_no_legal\", \"value_l\": \"georgia power\", \"value_r\": \"georgia power\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 2, \"record_number\": 0}, {\"sql_condition\": \"\\\"street_address_l\\\" IS NULL OR \\\"street_address_r\\\" IS NULL\", \"label_for_charts\": \"street_address is NULL\", \"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, \"comparison_vector_value\": -1, \"bayes_factor_description\": \"If comparison level is `street_address is null` then comparison is 1 times more likely to be a match\", \"column_name\": \"street_address\", \"value_l\": \"241 ralph mcgill boulevard, ['241' 'ralph' 'mcgill' 'boulevard']\", \"value_r\": \"None, nan\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 3, \"record_number\": 0}, {\"sql_condition\": \"\\\"state_l\\\" IS NULL OR \\\"state_r\\\" IS NULL\", \"label_for_charts\": \"state is NULL\", \"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, \"comparison_vector_value\": -1, \"bayes_factor_description\": \"If comparison level is `state is null` then comparison is 1 times more likely to be a match\", \"column_name\": \"state\", \"value_l\": \"ga\", 
\"value_r\": \"None\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 4, \"record_number\": 0}, {\"sql_condition\": \"\\\"state_l\\\" IS NULL OR \\\"state_r\\\" IS NULL\", \"label_for_charts\": \"state is NULL\", \"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, \"comparison_vector_value\": -1, \"bayes_factor_description\": \"If comparison level is `state is null` then comparison is 1 times more likely to be a match\", \"column_name\": \"tf_state\", \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 5, \"record_number\": 0}, {\"sql_condition\": \"\\\"city_l\\\" IS NULL OR \\\"city_r\\\" IS NULL\", \"label_for_charts\": \"city is NULL\", \"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, \"comparison_vector_value\": -1, \"bayes_factor_description\": \"If comparison level is `city is null` then comparison is 1 times more likely to be a match\", \"column_name\": \"city\", \"value_l\": \"atlanta\", \"value_r\": \"None\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 6, \"record_number\": 0}, {\"sql_condition\": \"\\\"city_l\\\" IS NULL OR \\\"city_r\\\" IS NULL\", \"label_for_charts\": \"city is NULL\", \"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, \"comparison_vector_value\": -1, \"bayes_factor_description\": \"If comparison level is `city is null` then comparison is 1 times more likely to be a match\", \"column_name\": \"tf_city\", \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 7, \"record_number\": 0}, {\"column_name\": \"Final score\", \"label_for_charts\": \"Final score\", \"sql_condition\": null, \"log2_bayes_factor\": -5.022258674429072, \"bayes_factor\": 0.030771558522274426, \"comparison_vector_value\": null, \"m_probability\": null, \"u_probability\": null, \"bayes_factor_description\": null, \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": null, \"bar_sort_order\": 8, \"record_number\": 0}, {\"column_name\": \"Prior\", 
\"label_for_charts\": \"Starting match weight (prior)\", \"sql_condition\": null, \"log2_bayes_factor\": -19.10240998404316, \"bayes_factor\": 1.7766488754200009e-06, \"comparison_vector_value\": null, \"m_probability\": null, \"u_probability\": null, \"bayes_factor_description\": null, \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": null, \"bar_sort_order\": 0, \"record_number\": 1}, {\"sql_condition\": \"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Exact match on company_name_no_legal\", \"m_probability\": 1.0, \"u_probability\": 1.1463345458785815e-06, \"bayes_factor\": 872345.6896551723, \"log2_bayes_factor\": 19.734540428156702, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on company_name_no_legal` then comparison is 872,346 times more likely to be a match\", \"column_name\": \"company_name_no_legal\", \"value_l\": \"duke energy\", \"value_r\": \"duke energy\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 1, \"record_number\": 1}, {\"sql_condition\": \"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Term freq adjustment on company_name_no_legal with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 0.009163622000592475, \"log2_bayes_factor\": -6.7698663359625515, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"Term frequency adjustment on company_name_no_legal makes comparison  109.13 times less likely to be a match\", \"column_name\": \"tf_company_name_no_legal\", \"value_l\": \"duke energy\", \"value_r\": \"duke energy\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 2, \"record_number\": 1}, {\"sql_condition\": \"\\\"street_address_l\\\" IS NULL OR \\\"street_address_r\\\" IS NULL\", \"label_for_charts\": \"street_address is NULL\", \"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, 
\"comparison_vector_value\": -1, \"bayes_factor_description\": \"If comparison level is `street_address is null` then comparison is 1 times more likely to be a match\", \"column_name\": \"street_address\", \"value_l\": \"526 south church street, ['526' 'south' 'church' 'street']\", \"value_r\": \"None, nan\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 3, \"record_number\": 1}, {\"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Exact match on state\", \"m_probability\": 0.7040254501446316, \"u_probability\": 0.047388885407471534, \"bayes_factor\": 14.856341188257446, \"log2_bayes_factor\": 3.8930069483478316, \"comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on state` then comparison is 14.86 times more likely to be a match\", \"column_name\": \"state\", \"value_l\": \"nc\", \"value_r\": \"nc\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 4, \"record_number\": 1}, {\"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Term freq adjustment on state with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 1.462263504865088, \"log2_bayes_factor\": 0.5482033132788914, \"comparison_vector_value\": 1, \"bayes_factor_description\": \"Term frequency adjustment on state makes comparison 1.46 times more likely to be a match\", \"column_name\": \"tf_state\", \"value_l\": \"nc\", \"value_r\": \"nc\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 5, \"record_number\": 1}, {\"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Exact match on city\", \"m_probability\": 0.5770511573326839, \"u_probability\": 0.0060865630580326634, \"bayes_factor\": 94.80738995567096, \"log2_bayes_factor\": 6.566927612205768, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on city` then comparison is 94.81 times more likely 
to be a match\", \"column_name\": \"city\", \"value_l\": \"charlotte\", \"value_r\": \"charlotte\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 6, \"record_number\": 1}, {\"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Term freq adjustment on city with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 0.4300084911960021, \"log2_bayes_factor\": -1.2175629465018993, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"Term frequency adjustment on city makes comparison  2.33 times less likely to be a match\", \"column_name\": \"tf_city\", \"value_l\": \"charlotte\", \"value_r\": \"charlotte\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 7, \"record_number\": 1}, {\"column_name\": \"Final score\", \"label_for_charts\": \"Final score\", \"sql_condition\": null, \"log2_bayes_factor\": 3.652839035481583, \"bayes_factor\": 12.57807323388316, \"comparison_vector_value\": null, \"m_probability\": null, \"u_probability\": null, \"bayes_factor_description\": null, \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": null, \"bar_sort_order\": 8, \"record_number\": 1}, {\"column_name\": \"Prior\", \"label_for_charts\": \"Starting match weight (prior)\", \"sql_condition\": null, \"log2_bayes_factor\": -19.10240998404316, \"bayes_factor\": 1.7766488754200009e-06, \"comparison_vector_value\": null, \"m_probability\": null, \"u_probability\": null, \"bayes_factor_description\": null, \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": null, \"bar_sort_order\": 0, \"record_number\": 2}, {\"sql_condition\": \"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Exact match on company_name_no_legal\", \"m_probability\": 1.0, \"u_probability\": 1.1463345458785815e-06, \"bayes_factor\": 872345.6896551723, \"log2_bayes_factor\": 19.734540428156702, \"comparison_vector_value\": 2, 
\"bayes_factor_description\": \"If comparison level is `exact match on company_name_no_legal` then comparison is 872,346 times more likely to be a match\", \"column_name\": \"company_name_no_legal\", \"value_l\": \"pacific gas and electric\", \"value_r\": \"pacific gas and electric\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 1, \"record_number\": 2}, {\"sql_condition\": \"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Term freq adjustment on company_name_no_legal with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 0.019854514334617032, \"log2_bayes_factor\": -5.654389118542616, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"Term frequency adjustment on company_name_no_legal makes comparison  50.37 times less likely to be a match\", \"column_name\": \"tf_company_name_no_legal\", \"value_l\": \"pacific gas and electric\", \"value_r\": \"pacific gas and electric\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 2, \"record_number\": 2}, {\"sql_condition\": \"LEAST(ARRAY_LENGTH(\\\"street_address_list_l\\\"), ARRAY_LENGTH(\\\"street_address_list_r\\\")) <> 0 AND ARRAY_LENGTH(ARRAY_INTERSECT(\\\"street_address_list_l\\\", \\\"street_address_list_r\\\")) = LEAST(ARRAY_LENGTH(\\\"street_address_list_l\\\"), ARRAY_LENGTH(\\\"street_address_list_r\\\"))\", \"label_for_charts\": \"Array subset\", \"m_probability\": 0.16666664895266775, \"u_probability\": 0.6267523364485982, \"bayes_factor\": 0.2659210652441446, \"log2_bayes_factor\": -1.9109300284119297, \"comparison_vector_value\": 0, \"bayes_factor_description\": \"If comparison level is `array subset` then comparison is 3.761 times less likely to be a match\", \"column_name\": \"street_address\", \"value_l\": \"77 beale st, ['77' 'beale' 'st']\", \"value_r\": \"77 beale st  rm 1279amc n12e, ['77' 'beale' 'st' 'rm' '1279amc' 'n12e']\", \"term_frequency_adjustment\": false, 
\"bar_sort_order\": 3, \"record_number\": 2}, {\"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Exact match on state\", \"m_probability\": 0.7040254501446316, \"u_probability\": 0.047388885407471534, \"bayes_factor\": 14.856341188257446, \"log2_bayes_factor\": 3.8930069483478316, \"comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on state` then comparison is 14.86 times more likely to be a match\", \"column_name\": \"state\", \"value_l\": \"ca\", \"value_r\": \"ca\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 4, \"record_number\": 2}, {\"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Term freq adjustment on state with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 0.31774355245472224, \"log2_bayes_factor\": -1.6540652440425645, \"comparison_vector_value\": 1, \"bayes_factor_description\": \"Term frequency adjustment on state makes comparison  3.15 times less likely to be a match\", \"column_name\": \"tf_state\", \"value_l\": \"ca\", \"value_r\": \"ca\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 5, \"record_number\": 2}, {\"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Exact match on city\", \"m_probability\": 0.5770511573326839, \"u_probability\": 0.0060865630580326634, \"bayes_factor\": 94.80738995567096, \"log2_bayes_factor\": 6.566927612205768, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on city` then comparison is 94.81 times more likely to be a match\", \"column_name\": \"city\", \"value_l\": \"san francisco\", \"value_r\": \"san francisco\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 6, \"record_number\": 2}, {\"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Term freq adjustment on city with weight {cl.tf_adjustment_weight}\", 
\"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 0.45509231984910226, \"log2_bayes_factor\": -1.135768855338252, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"Term frequency adjustment on city makes comparison  2.20 times less likely to be a match\", \"column_name\": \"tf_city\", \"value_l\": \"san francisco\", \"value_r\": \"san francisco\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 7, \"record_number\": 2}, {\"column_name\": \"Final score\", \"label_for_charts\": \"Final score\", \"sql_condition\": null, \"log2_bayes_factor\": 0.7369117583317809, \"bayes_factor\": 1.6666044742323236, \"comparison_vector_value\": null, \"m_probability\": null, \"u_probability\": null, \"bayes_factor_description\": null, \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": null, \"bar_sort_order\": 8, \"record_number\": 2}, {\"column_name\": \"Prior\", \"label_for_charts\": \"Starting match weight (prior)\", \"sql_condition\": null, \"log2_bayes_factor\": -19.10240998404316, \"bayes_factor\": 1.7766488754200009e-06, \"comparison_vector_value\": null, \"m_probability\": null, \"u_probability\": null, \"bayes_factor_description\": null, \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": null, \"bar_sort_order\": 0, \"record_number\": 3}, {\"sql_condition\": \"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Exact match on company_name_no_legal\", \"m_probability\": 1.0, \"u_probability\": 1.1463345458785815e-06, \"bayes_factor\": 872345.6896551723, \"log2_bayes_factor\": 19.734540428156702, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on company_name_no_legal` then comparison is 872,346 times more likely to be a match\", \"column_name\": \"company_name_no_legal\", \"value_l\": \"firstenergy\", \"value_r\": \"firstenergy\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 1, \"record_number\": 
3}, {\"sql_condition\": \"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Term freq adjustment on company_name_no_legal with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 0.014890885750962774, \"log2_bayes_factor\": -6.069426617821459, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"Term frequency adjustment on company_name_no_legal makes comparison  67.16 times less likely to be a match\", \"column_name\": \"tf_company_name_no_legal\", \"value_l\": \"firstenergy\", \"value_r\": \"firstenergy\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 2, \"record_number\": 3}, {\"sql_condition\": \"\\\"street_address_l\\\" = \\\"street_address_r\\\"\", \"label_for_charts\": \"Exact match on street_address\", \"m_probability\": 0.7549019576781999, \"u_probability\": 0.13960280373831777, \"bayes_factor\": 5.407498542029615, \"log2_bayes_factor\": 2.434961371207702, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on street_address` then comparison is 5.407 times more likely to be a match\", \"column_name\": \"street_address\", \"value_l\": \"76 south main st, ['76' 'south' 'main' 'st']\", \"value_r\": \"76 south main st, ['76' 'south' 'main' 'st']\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 3, \"record_number\": 3}, {\"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Exact match on state\", \"m_probability\": 0.7040254501446316, \"u_probability\": 0.047388885407471534, \"bayes_factor\": 14.856341188257446, \"log2_bayes_factor\": 3.8930069483478316, \"comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on state` then comparison is 14.86 times more likely to be a match\", \"column_name\": \"state\", \"value_l\": \"oh\", \"value_r\": \"oh\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 4, 
\"record_number\": 3}, {\"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Term freq adjustment on state with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 2.524753729087804, \"log2_bayes_factor\": 1.3361426705767852, \"comparison_vector_value\": 1, \"bayes_factor_description\": \"Term frequency adjustment on state makes comparison 2.52 times more likely to be a match\", \"column_name\": \"tf_state\", \"value_l\": \"oh\", \"value_r\": \"oh\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 5, \"record_number\": 3}, {\"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Exact match on city\", \"m_probability\": 0.5770511573326839, \"u_probability\": 0.0060865630580326634, \"bayes_factor\": 94.80738995567096, \"log2_bayes_factor\": 6.566927612205768, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on city` then comparison is 94.81 times more likely to be a match\", \"column_name\": \"city\", \"value_l\": \"akron\", \"value_r\": \"akron\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 6, \"record_number\": 3}, {\"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Term freq adjustment on city with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 7.691701180548207, \"log2_bayes_factor\": 2.9433027156529468, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"Term frequency adjustment on city makes comparison 7.69 times more likely to be a match\", \"column_name\": \"tf_city\", \"value_l\": \"akron\", \"value_r\": \"akron\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 7, \"record_number\": 3}, {\"column_name\": \"Final score\", \"label_for_charts\": \"Final score\", \"sql_condition\": null, \"log2_bayes_factor\": 11.737045144283117, \"bayes_factor\": 3413.5215495150405, 
\"comparison_vector_value\": null, \"m_probability\": null, \"u_probability\": null, \"bayes_factor_description\": null, \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": null, \"bar_sort_order\": 8, \"record_number\": 3}, {\"column_name\": \"Prior\", \"label_for_charts\": \"Starting match weight (prior)\", \"sql_condition\": null, \"log2_bayes_factor\": -19.10240998404316, \"bayes_factor\": 1.7766488754200009e-06, \"comparison_vector_value\": null, \"m_probability\": null, \"u_probability\": null, \"bayes_factor_description\": null, \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": null, \"bar_sort_order\": 0, \"record_number\": 4}, {\"sql_condition\": \"jaro_winkler_similarity(\\\"company_name_no_legal_l\\\", \\\"company_name_no_legal_r\\\") >= 0.95\", \"label_for_charts\": \"Jaro-Winkler distance of company_name_no_legal >= 0.95\", \"m_probability\": 0.0022252026790704244, \"u_probability\": 3.6564119135782337e-07, \"bayes_factor\": 6085.754919480062, \"log2_bayes_factor\": 12.571220520655558, \"comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of company_name_no_legal >= 0.95` then comparison is 6,086 times more likely to be a match\", \"column_name\": \"company_name_no_legal\", \"value_l\": \"firstenergy\", \"value_r\": \"first energy\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 1, \"record_number\": 4}, {\"sql_condition\": \"jaro_winkler_similarity(\\\"company_name_no_legal_l\\\", \\\"company_name_no_legal_r\\\") >= 0.95\", \"label_for_charts\": \"Jaro-Winkler distance of company_name_no_legal >= 0.95\", \"m_probability\": 0.0022252026790704244, \"u_probability\": 3.6564119135782337e-07, \"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, \"comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of company_name_no_legal >= 0.95` then comparison is 6,086 times more likely to be a match\", 
\"column_name\": \"tf_company_name_no_legal\", \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 2, \"record_number\": 4}, {\"sql_condition\": \"\\\"street_address_l\\\" IS NULL OR \\\"street_address_r\\\" IS NULL\", \"label_for_charts\": \"street_address is NULL\", \"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, \"comparison_vector_value\": -1, \"bayes_factor_description\": \"If comparison level is `street_address is null` then comparison is 1 times more likely to be a match\", \"column_name\": \"street_address\", \"value_l\": \"76 south main st, ['76' 'south' 'main' 'st']\", \"value_r\": \"None, nan\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 3, \"record_number\": 4}, {\"sql_condition\": \"\\\"state_l\\\" IS NULL OR \\\"state_r\\\" IS NULL\", \"label_for_charts\": \"state is NULL\", \"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, \"comparison_vector_value\": -1, \"bayes_factor_description\": \"If comparison level is `state is null` then comparison is 1 times more likely to be a match\", \"column_name\": \"state\", \"value_l\": \"oh\", \"value_r\": \"None\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 4, \"record_number\": 4}, {\"sql_condition\": \"\\\"state_l\\\" IS NULL OR \\\"state_r\\\" IS NULL\", \"label_for_charts\": \"state is NULL\", \"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, \"comparison_vector_value\": -1, \"bayes_factor_description\": \"If comparison level is `state is null` then comparison is 1 times more likely to be a match\", \"column_name\": \"tf_state\", \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 5, \"record_number\": 4}, {\"sql_condition\": \"\\\"city_l\\\" IS NULL OR \\\"city_r\\\" IS NULL\", \"label_for_charts\": \"city is NULL\", \"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, \"comparison_vector_value\": -1, \"bayes_factor_description\": \"If comparison level is `city is null` then comparison is 1 
times more likely to be a match\", \"column_name\": \"city\", \"value_l\": \"akron\", \"value_r\": \"None\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 6, \"record_number\": 4}, {\"sql_condition\": \"\\\"city_l\\\" IS NULL OR \\\"city_r\\\" IS NULL\", \"label_for_charts\": \"city is NULL\", \"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, \"comparison_vector_value\": -1, \"bayes_factor_description\": \"If comparison level is `city is null` then comparison is 1 times more likely to be a match\", \"column_name\": \"tf_city\", \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 7, \"record_number\": 4}, {\"column_name\": \"Final score\", \"label_for_charts\": \"Final score\", \"sql_condition\": null, \"log2_bayes_factor\": -6.5311894633876015, \"bayes_factor\": 0.010812249633775993, \"comparison_vector_value\": null, \"m_probability\": null, \"u_probability\": null, \"bayes_factor_description\": null, \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": null, \"bar_sort_order\": 8, \"record_number\": 4}, {\"column_name\": \"Prior\", \"label_for_charts\": \"Starting match weight (prior)\", \"sql_condition\": null, \"log2_bayes_factor\": -19.10240998404316, \"bayes_factor\": 1.7766488754200009e-06, \"comparison_vector_value\": null, \"m_probability\": null, \"u_probability\": null, \"bayes_factor_description\": null, \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": null, \"bar_sort_order\": 0, \"record_number\": 5}, {\"sql_condition\": \"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Exact match on company_name_no_legal\", \"m_probability\": 1.0, \"u_probability\": 1.1463345458785815e-06, \"bayes_factor\": 872345.6896551723, \"log2_bayes_factor\": 19.734540428156702, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on company_name_no_legal` then comparison is 872,346 times more 
likely to be a match\", \"column_name\": \"company_name_no_legal\", \"value_l\": \"central maine power\", \"value_r\": \"central maine power\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 1, \"record_number\": 5}, {\"sql_condition\": \"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Term freq adjustment on company_name_no_legal with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 0.02978177150192555, \"log2_bayes_factor\": -5.069426617821459, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"Term frequency adjustment on company_name_no_legal makes comparison  33.58 times less likely to be a match\", \"column_name\": \"tf_company_name_no_legal\", \"value_l\": \"central maine power\", \"value_r\": \"central maine power\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 2, \"record_number\": 5}, {\"sql_condition\": \"\\\"street_address_l\\\" IS NULL OR \\\"street_address_r\\\" IS NULL\", \"label_for_charts\": \"street_address is NULL\", \"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, \"comparison_vector_value\": -1, \"bayes_factor_description\": \"If comparison level is `street_address is null` then comparison is 1 times more likely to be a match\", \"column_name\": \"street_address\", \"value_l\": \"edison dr, ['edison' 'dr']\", \"value_r\": \"None, nan\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 3, \"record_number\": 5}, {\"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Exact match on state\", \"m_probability\": 0.7040254501446316, \"u_probability\": 0.047388885407471534, \"bayes_factor\": 14.856341188257446, \"log2_bayes_factor\": 3.8930069483478316, \"comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on state` then comparison is 14.86 times more likely to be a match\", \"column_name\": \"state\", \"value_l\": \"me\", \"value_r\": 
\"me\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 4, \"record_number\": 5}, {\"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Term freq adjustment on state with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 12.796894615433612, \"log2_bayes_factor\": 3.67772185304552, \"comparison_vector_value\": 1, \"bayes_factor_description\": \"Term frequency adjustment on state makes comparison 12.80 times more likely to be a match\", \"column_name\": \"tf_state\", \"value_l\": \"me\", \"value_r\": \"me\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 5, \"record_number\": 5}, {\"sql_condition\": \"\\\"city_l\\\" IS NULL OR \\\"city_r\\\" IS NULL\", \"label_for_charts\": \"city is NULL\", \"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, \"comparison_vector_value\": -1, \"bayes_factor_description\": \"If comparison level is `city is null` then comparison is 1 times more likely to be a match\", \"column_name\": \"city\", \"value_l\": \"augusta\", \"value_r\": \"None\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 6, \"record_number\": 5}, {\"sql_condition\": \"\\\"city_l\\\" IS NULL OR \\\"city_r\\\" IS NULL\", \"label_for_charts\": \"city is NULL\", \"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, \"comparison_vector_value\": -1, \"bayes_factor_description\": \"If comparison level is `city is null` then comparison is 1 times more likely to be a match\", \"column_name\": \"tf_city\", \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 7, \"record_number\": 5}, {\"column_name\": \"Final score\", \"label_for_charts\": \"Final score\", \"sql_condition\": null, \"log2_bayes_factor\": 3.133432627685436, \"bayes_factor\": 8.775203775440346, \"comparison_vector_value\": null, \"m_probability\": null, \"u_probability\": null, \"bayes_factor_description\": null, \"value_l\": \"\", \"value_r\": \"\", 
\"term_frequency_adjustment\": null, \"bar_sort_order\": 8, \"record_number\": 5}]}}, {\"mode\": \"vega-lite\"});\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"layer\": [{\"layer\": [{\"mark\": \"rule\", \"encoding\": {\"color\": {\"value\": \"black\"}, \"size\": {\"value\": 0.5}, \"y\": {\"field\": \"zero\", \"type\": \"quantitative\"}}}, {\"mark\": {\"type\": \"bar\", \"width\": 60}, \"encoding\": {\"color\": {\"condition\": {\"test\": \"(datum.log2_bayes_factor < 0)\", \"value\": \"red\"}, \"value\": \"green\"}, \"opacity\": {\"condition\": {\"test\": \"datum.column_name == 'Prior match weight' || datum.column_name == 'Final score'\", \"value\": 1}, \"value\": 0.5}, \"tooltip\": [{\"field\": \"column_name\", \"title\": \"Comparison column\", \"type\": \"nominal\"}, {\"field\": \"value_l\", \"title\": \"Value (L)\", \"type\": \"nominal\"}, {\"field\": \"value_r\", \"title\": \"Value (R)\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"comparison_vector_value\", \"title\": \"Comparison vector value\", \"type\": \"nominal\"}, {\"field\": \"bayes_factor\", \"format\": \",.4f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \",.4f\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"prob\", \"format\": \".4f\", \"title\": \"Cumulative match probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"grid\": true, \"labelAlign\": \"center\", \"labelAngle\": -20, \"labelExpr\": \"datum.value == 'Prior' || datum.value == 'Final score' ? 
'' : datum.value\", \"labelPadding\": 10, \"tickBand\": \"extent\", \"title\": \"Column\"}, \"field\": \"column_name\", \"sort\": {\"field\": \"bar_sort_order\", \"order\": \"ascending\"}, \"type\": \"nominal\"}, \"y\": {\"axis\": {\"grid\": false, \"orient\": \"left\", \"title\": \"Match Weight\"}, \"field\": \"previous_sum\", \"type\": \"quantitative\"}, \"y2\": {\"field\": \"sum\"}}}, {\"mark\": {\"type\": \"text\", \"fontWeight\": \"bold\"}, \"encoding\": {\"color\": {\"value\": \"white\"}, \"text\": {\"condition\": {\"test\": \"abs(datum.log2_bayes_factor) > 1\", \"field\": \"log2_bayes_factor\", \"format\": \".2f\", \"type\": \"nominal\"}, \"value\": \"\"}, \"x\": {\"axis\": {\"labelAngle\": -20, \"title\": \"Column\"}, \"field\": \"column_name\", \"sort\": {\"field\": \"bar_sort_order\", \"order\": \"ascending\"}, \"type\": \"nominal\"}, \"y\": {\"axis\": {\"orient\": \"left\"}, \"field\": \"center\", \"type\": \"quantitative\"}}}, {\"mark\": {\"type\": \"text\", \"baseline\": \"bottom\", \"dy\": -25, \"fontWeight\": \"bold\"}, \"encoding\": {\"color\": {\"value\": \"black\"}, \"text\": {\"field\": \"column_name\", \"type\": \"nominal\"}, \"x\": {\"axis\": {\"labelAngle\": -20, \"title\": \"Column\"}, \"field\": \"column_name\", \"sort\": {\"field\": \"bar_sort_order\", \"order\": \"ascending\"}, \"type\": \"nominal\"}, \"y\": {\"field\": \"sum_top\", \"type\": \"quantitative\"}}}, {\"mark\": {\"type\": \"text\", \"baseline\": \"bottom\", \"dy\": -13, \"fontSize\": 8}, \"encoding\": {\"color\": {\"value\": \"grey\"}, \"text\": {\"field\": \"value_l\", \"type\": \"nominal\"}, \"x\": {\"axis\": {\"labelAngle\": -20, \"title\": \"Column\"}, \"field\": \"column_name\", \"sort\": {\"field\": \"bar_sort_order\", \"order\": \"ascending\"}, \"type\": \"nominal\"}, \"y\": {\"field\": \"sum_top\", \"type\": \"quantitative\"}}}, {\"mark\": {\"type\": \"text\", \"baseline\": \"bottom\", \"dy\": -5, \"fontSize\": 8}, \"encoding\": {\"color\": {\"value\": \"grey\"}, 
\"text\": {\"field\": \"value_r\", \"type\": \"nominal\"}, \"x\": {\"axis\": {\"labelAngle\": -20, \"title\": \"Column\"}, \"field\": \"column_name\", \"sort\": {\"field\": \"bar_sort_order\", \"order\": \"ascending\"}, \"type\": \"nominal\"}, \"y\": {\"field\": \"sum_top\", \"type\": \"quantitative\"}}}]}, {\"mark\": {\"type\": \"rule\", \"color\": \"black\", \"strokeWidth\": 2, \"x2Offset\": 30, \"xOffset\": -30}, \"encoding\": {\"x\": {\"axis\": {\"labelAngle\": -20, \"title\": \"Column\"}, \"field\": \"column_name\", \"sort\": {\"field\": \"bar_sort_order\", \"order\": \"ascending\"}, \"type\": \"nominal\"}, \"x2\": {\"field\": \"lead\"}, \"y\": {\"axis\": {\"labelExpr\": \"format(1 / (1 + pow(2, -1*datum.value)), '.2r')\", \"orient\": \"right\", \"title\": \"Probability\"}, \"field\": \"sum\", \"scale\": {\"zero\": false}, \"type\": \"quantitative\"}}}], \"data\": {\"name\": \"data-ce971900907c074f7a457e0558aa94bb\"}, \"height\": 450, \"params\": [{\"name\": \"record_number\", \"bind\": {\"input\": \"range\", \"max\": 3, \"min\": 0, \"step\": 1}, \"value\": 0}], \"resolve\": {\"axis\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Match weights waterfall chart\", \"subtitle\": \"How each comparison contributes to the final match score\"}, \"transform\": [{\"filter\": \"(datum.record_number == record_number)\"}, {\"filter\": \"(datum.bayes_factor !== 1.0)\"}, {\"window\": [{\"op\": \"sum\", \"field\": \"log2_bayes_factor\", \"as\": \"sum\"}, {\"op\": \"lead\", \"field\": \"column_name\", \"as\": \"lead\"}], \"frame\": [null, 0]}, {\"calculate\": \"datum.column_name === \\\"Final score\\\" ? datum.sum - datum.log2_bayes_factor : datum.sum\", \"as\": \"sum\"}, {\"calculate\": \"datum.lead === null ? datum.column_name : datum.lead\", \"as\": \"lead\"}, {\"calculate\": \"datum.column_name === \\\"Final score\\\" || datum.column_name === \\\"Prior match weight\\\" ? 
0 : datum.sum - datum.log2_bayes_factor\", \"as\": \"previous_sum\"}, {\"calculate\": \"datum.sum > datum.previous_sum ? datum.column_name : \\\"\\\"\", \"as\": \"top_label\"}, {\"calculate\": \"datum.sum < datum.previous_sum ? datum.column_name : \\\"\\\"\", \"as\": \"bottom_label\"}, {\"calculate\": \"datum.sum > datum.previous_sum ? datum.sum : datum.previous_sum\", \"as\": \"sum_top\"}, {\"calculate\": \"datum.sum < datum.previous_sum ? datum.sum : datum.previous_sum\", \"as\": \"sum_bottom\"}, {\"calculate\": \"(datum.sum + datum.previous_sum) / 2\", \"as\": \"center\"}, {\"calculate\": \"(datum.log2_bayes_factor > 0 ? \\\"+\\\" : \\\"\\\") + datum.log2_bayes_factor\", \"as\": \"text_log2_bayes_factor\"}, {\"calculate\": \"datum.sum < datum.previous_sum ? 4 : -4\", \"as\": \"dy\"}, {\"calculate\": \"datum.sum < datum.previous_sum ? \\\"top\\\" : \\\"bottom\\\"\", \"as\": \"baseline\"}, {\"calculate\": \"1. / (1 + pow(2, -1.*datum.sum))\", \"as\": \"prob\"}, {\"calculate\": \"0*datum.sum\", \"as\": \"zero\"}], \"width\": {\"step\": 75}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-ce971900907c074f7a457e0558aa94bb\": [{\"column_name\": \"Prior\", \"label_for_charts\": \"Starting match weight (prior)\", \"sql_condition\": null, \"log2_bayes_factor\": -18.684061249539493, \"bayes_factor\": 2.3743083676072958e-06, \"comparison_vector_value\": null, \"m_probability\": null, \"u_probability\": null, \"bayes_factor_description\": null, \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": null, \"bar_sort_order\": 0, \"record_number\": 0}, {\"sql_condition\": \"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Exact match on company_name_no_legal\", \"m_probability\": 0.5058151845954096, \"u_probability\": 7.755771009548427e-07, \"bayes_factor\": 652179.1114934688, \"log2_bayes_factor\": 19.314908708489483, \"comparison_vector_value\": 2, 
\"bayes_factor_description\": \"If comparison level is `exact match on company_name_no_legal` then comparison is 652,179 times more likely to be a match\", \"column_name\": \"company_name_no_legal\", \"value_l\": \"duke energy\", \"value_r\": \"duke energy\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 1, \"record_number\": 0}, {\"sql_condition\": \"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Term freq adjustment on company_name_no_legal with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 0.007934832372731376, \"log2_bayes_factor\": -6.977584538927854, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"Term frequency adjustment on company_name_no_legal makes comparison  126.03 times less likely to be a match\", \"column_name\": \"tf_company_name_no_legal\", \"value_l\": \"duke energy\", \"value_r\": \"duke energy\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 2, \"record_number\": 0}, {\"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.8816287878761582, \"u_probability\": 0.9999683604315995, \"bayes_factor\": 0.8816566831130892, \"log2_bayes_factor\": -0.18171111483340682, \"comparison_vector_value\": 0, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.134 times less likely to be a match\", \"column_name\": \"street_address\", \"value_l\": \"526 south church st\", \"value_r\": \"p o box 1244\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 3, \"record_number\": 0}, {\"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.8816287878761582, \"u_probability\": 0.9999683604315995, \"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, \"comparison_vector_value\": 0, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.134 times less 
likely to be a match\", \"column_name\": \"tf_street_address\", \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 4, \"record_number\": 0}, {\"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Exact match on state\", \"m_probability\": 0.811446456778291, \"u_probability\": 0.05111863613991284, \"bayes_factor\": 15.873789248941307, \"log2_bayes_factor\": 3.9885746514233986, \"comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on state` then comparison is 15.87 times more likely to be a match\", \"column_name\": \"state\", \"value_l\": \"nc\", \"value_r\": \"nc\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 5, \"record_number\": 0}, {\"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Term freq adjustment on state with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 1.5912118284393766, \"log2_bayes_factor\": 0.6701259057484746, \"comparison_vector_value\": 1, \"bayes_factor_description\": \"Term frequency adjustment on state makes comparison 1.59 times more likely to be a match\", \"column_name\": \"tf_state\", \"value_l\": \"nc\", \"value_r\": \"nc\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 6, \"record_number\": 0}, {\"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Exact match on city\", \"m_probability\": 0.6839791977766309, \"u_probability\": 0.006331298335827164, \"bayes_factor\": 108.03142759932369, \"log2_bayes_factor\": 6.755307259996993, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on city` then comparison is 108 times more likely to be a match\", \"column_name\": \"city\", \"value_l\": \"charlotte\", \"value_r\": \"charlotte\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 7, \"record_number\": 0}, {\"sql_condition\": 
\"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Term freq adjustment on city with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 0.43146742942272814, \"log2_bayes_factor\": -1.2126764374155323, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"Term frequency adjustment on city makes comparison  2.32 times less likely to be a match\", \"column_name\": \"tf_city\", \"value_l\": \"charlotte\", \"value_r\": \"charlotte\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 8, \"record_number\": 0}, {\"column_name\": \"Final score\", \"label_for_charts\": \"Final score\", \"sql_condition\": null, \"log2_bayes_factor\": 3.672883184942062, \"bayes_factor\": 12.754046887304453, \"comparison_vector_value\": null, \"m_probability\": null, \"u_probability\": null, \"bayes_factor_description\": null, \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": null, \"bar_sort_order\": 9, \"record_number\": 0}, {\"column_name\": \"Prior\", \"label_for_charts\": \"Starting match weight (prior)\", \"sql_condition\": null, \"log2_bayes_factor\": -18.684061249539493, \"bayes_factor\": 2.3743083676072958e-06, \"comparison_vector_value\": null, \"m_probability\": null, \"u_probability\": null, \"bayes_factor_description\": null, \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": null, \"bar_sort_order\": 0, \"record_number\": 1}, {\"sql_condition\": \"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Exact match on company_name_no_legal\", \"m_probability\": 0.5058151845954096, \"u_probability\": 7.755771009548427e-07, \"bayes_factor\": 652179.1114934688, \"log2_bayes_factor\": 19.314908708489483, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on company_name_no_legal` then comparison is 652,179 times more likely to be a match\", \"column_name\": \"company_name_no_legal\", 
\"value_l\": \"firstenergy\", \"value_r\": \"firstenergy\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 1, \"record_number\": 1}, {\"sql_condition\": \"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Term freq adjustment on company_name_no_legal with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 0.012695731796370202, \"log2_bayes_factor\": -6.299512633815216, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"Term frequency adjustment on company_name_no_legal makes comparison  78.77 times less likely to be a match\", \"column_name\": \"tf_company_name_no_legal\", \"value_l\": \"firstenergy\", \"value_r\": \"firstenergy\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 2, \"record_number\": 1}, {\"sql_condition\": \"\\\"street_address_l\\\" = \\\"street_address_r\\\"\", \"label_for_charts\": \"Exact match on street_address\", \"m_probability\": 0.10984848485092523, \"u_probability\": 1.1623713218156555e-05, \"bayes_factor\": 9450.378101150924, \"log2_bayes_factor\": 13.20615633608501, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on street_address` then comparison is 9,450 times more likely to be a match\", \"column_name\": \"street_address\", \"value_l\": \"76 south main st\", \"value_r\": \"76 south main st\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 3, \"record_number\": 1}, {\"sql_condition\": \"\\\"street_address_l\\\" = \\\"street_address_r\\\"\", \"label_for_charts\": \"Term freq adjustment on street_address with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 0.07318200428972765, \"log2_bayes_factor\": -3.7723672613995376, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"Term frequency adjustment on street_address makes comparison  13.66 times less likely to be a match\", 
\"column_name\": \"tf_street_address\", \"value_l\": \"76 south main st\", \"value_r\": \"76 south main st\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 4, \"record_number\": 1}, {\"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Exact match on state\", \"m_probability\": 0.811446456778291, \"u_probability\": 0.05111863613991284, \"bayes_factor\": 15.873789248941307, \"log2_bayes_factor\": 3.9885746514233986, \"comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on state` then comparison is 15.87 times more likely to be a match\", \"column_name\": \"state\", \"value_l\": \"oh\", \"value_r\": \"oh\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 5, \"record_number\": 1}, {\"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Term freq adjustment on state with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 3.0085647667960136, \"log2_bayes_factor\": 1.5890754146802368, \"comparison_vector_value\": 1, \"bayes_factor_description\": \"Term frequency adjustment on state makes comparison 3.01 times more likely to be a match\", \"column_name\": \"tf_state\", \"value_l\": \"oh\", \"value_r\": \"oh\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 6, \"record_number\": 1}, {\"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Exact match on city\", \"m_probability\": 0.6839791977766309, \"u_probability\": 0.006331298335827164, \"bayes_factor\": 108.03142759932369, \"log2_bayes_factor\": 6.755307259996993, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on city` then comparison is 108 times more likely to be a match\", \"column_name\": \"city\", \"value_l\": \"akron\", \"value_r\": \"akron\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 7, \"record_number\": 1}, {\"sql_condition\": 
\"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Term freq adjustment on city with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 8.77560873402159, \"log2_bayes_factor\": 3.1334992037185074, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"Term frequency adjustment on city makes comparison 8.78 times more likely to be a match\", \"column_name\": \"tf_city\", \"value_l\": \"akron\", \"value_r\": \"akron\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 8, \"record_number\": 1}, {\"column_name\": \"Final score\", \"label_for_charts\": \"Final score\", \"sql_condition\": null, \"log2_bayes_factor\": 19.231580429639383, \"bayes_factor\": 615577.2672779278, \"comparison_vector_value\": null, \"m_probability\": null, \"u_probability\": null, \"bayes_factor_description\": null, \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": null, \"bar_sort_order\": 9, \"record_number\": 1}, {\"column_name\": \"Prior\", \"label_for_charts\": \"Starting match weight (prior)\", \"sql_condition\": null, \"log2_bayes_factor\": -18.684061249539493, \"bayes_factor\": 2.3743083676072958e-06, \"comparison_vector_value\": null, \"m_probability\": null, \"u_probability\": null, \"bayes_factor_description\": null, \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": null, \"bar_sort_order\": 0, \"record_number\": 2}, {\"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.9860441659892117, \"u_probability\": 0.9999988167477563, \"bayes_factor\": 0.9860453327295641, \"log2_bayes_factor\": -0.020274119885879474, \"comparison_vector_value\": 0, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.014 times less likely to be a match\", \"column_name\": \"company_name_no_legal\", \"value_l\": \"firstenergy\", \"value_r\": \"firstenergy nuclear generation\", 
\"term_frequency_adjustment\": false, \"bar_sort_order\": 1, \"record_number\": 2}, {\"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.9860441659892117, \"u_probability\": 0.9999988167477563, \"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, \"comparison_vector_value\": 0, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.014 times less likely to be a match\", \"column_name\": \"tf_company_name_no_legal\", \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 2, \"record_number\": 2}, {\"sql_condition\": \"\\\"street_address_l\\\" = \\\"street_address_r\\\"\", \"label_for_charts\": \"Exact match on street_address\", \"m_probability\": 0.10984848485092523, \"u_probability\": 1.1623713218156555e-05, \"bayes_factor\": 9450.378101150924, \"log2_bayes_factor\": 13.20615633608501, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on street_address` then comparison is 9,450 times more likely to be a match\", \"column_name\": \"street_address\", \"value_l\": \"76 south main st\", \"value_r\": \"76 south main st\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 3, \"record_number\": 2}, {\"sql_condition\": \"\\\"street_address_l\\\" = \\\"street_address_r\\\"\", \"label_for_charts\": \"Term freq adjustment on street_address with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 0.07318200428972765, \"log2_bayes_factor\": -3.7723672613995376, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"Term frequency adjustment on street_address makes comparison  13.66 times less likely to be a match\", \"column_name\": \"tf_street_address\", \"value_l\": \"76 south main st\", \"value_r\": \"76 south main st\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 4, \"record_number\": 2}, {\"sql_condition\": 
\"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Exact match on state\", \"m_probability\": 0.811446456778291, \"u_probability\": 0.05111863613991284, \"bayes_factor\": 15.873789248941307, \"log2_bayes_factor\": 3.9885746514233986, \"comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on state` then comparison is 15.87 times more likely to be a match\", \"column_name\": \"state\", \"value_l\": \"oh\", \"value_r\": \"oh\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 5, \"record_number\": 2}, {\"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Term freq adjustment on state with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 3.0085647667960136, \"log2_bayes_factor\": 1.5890754146802368, \"comparison_vector_value\": 1, \"bayes_factor_description\": \"Term frequency adjustment on state makes comparison 3.01 times more likely to be a match\", \"column_name\": \"tf_state\", \"value_l\": \"oh\", \"value_r\": \"oh\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 6, \"record_number\": 2}, {\"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Exact match on city\", \"m_probability\": 0.6839791977766309, \"u_probability\": 0.006331298335827164, \"bayes_factor\": 108.03142759932369, \"log2_bayes_factor\": 6.755307259996993, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on city` then comparison is 108 times more likely to be a match\", \"column_name\": \"city\", \"value_l\": \"akron\", \"value_r\": \"akron\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 7, \"record_number\": 2}, {\"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Term freq adjustment on city with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 8.77560873402159, 
\"log2_bayes_factor\": 3.1334992037185074, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"Term frequency adjustment on city makes comparison 8.78 times more likely to be a match\", \"column_name\": \"tf_city\", \"value_l\": \"akron\", \"value_r\": \"akron\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 8, \"record_number\": 2}, {\"column_name\": \"Final score\", \"label_for_charts\": \"Final score\", \"sql_condition\": null, \"log2_bayes_factor\": 6.1959102350792365, \"bayes_factor\": 73.30858404657978, \"comparison_vector_value\": null, \"m_probability\": null, \"u_probability\": null, \"bayes_factor_description\": null, \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": null, \"bar_sort_order\": 9, \"record_number\": 2}, {\"column_name\": \"Prior\", \"label_for_charts\": \"Starting match weight (prior)\", \"sql_condition\": null, \"log2_bayes_factor\": -18.684061249539493, \"bayes_factor\": 2.3743083676072958e-06, \"comparison_vector_value\": null, \"m_probability\": null, \"u_probability\": null, \"bayes_factor_description\": null, \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": null, \"bar_sort_order\": 0, \"record_number\": 3}, {\"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.9860441659892117, \"u_probability\": 0.9999988167477563, \"bayes_factor\": 0.9860453327295641, \"log2_bayes_factor\": -0.020274119885879474, \"comparison_vector_value\": 0, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.014 times less likely to be a match\", \"column_name\": \"company_name_no_legal\", \"value_l\": \"sempra energy\", \"value_r\": \"sempra generation\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 1, \"record_number\": 3}, {\"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.9860441659892117, \"u_probability\": 0.9999988167477563, 
\"bayes_factor\": 1.0, \"log2_bayes_factor\": 0.0, \"comparison_vector_value\": 0, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.014 times less likely to be a match\", \"column_name\": \"tf_company_name_no_legal\", \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 2, \"record_number\": 3}, {\"sql_condition\": \"\\\"street_address_l\\\" = \\\"street_address_r\\\"\", \"label_for_charts\": \"Exact match on street_address\", \"m_probability\": 0.10984848485092523, \"u_probability\": 1.1623713218156555e-05, \"bayes_factor\": 9450.378101150924, \"log2_bayes_factor\": 13.20615633608501, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on street_address` then comparison is 9,450 times more likely to be a match\", \"column_name\": \"street_address\", \"value_l\": \"488 8th ave\", \"value_r\": \"488 8th ave\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 3, \"record_number\": 3}, {\"sql_condition\": \"\\\"street_address_l\\\" = \\\"street_address_r\\\"\", \"label_for_charts\": \"Term freq adjustment on street_address with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 0.1902732111532919, \"log2_bayes_factor\": -2.3938556381458076, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"Term frequency adjustment on street_address makes comparison  5.26 times less likely to be a match\", \"column_name\": \"tf_street_address\", \"value_l\": \"488 8th ave\", \"value_r\": \"488 8th ave\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 4, \"record_number\": 3}, {\"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Exact match on state\", \"m_probability\": 0.811446456778291, \"u_probability\": 0.05111863613991284, \"bayes_factor\": 15.873789248941307, \"log2_bayes_factor\": 3.9885746514233986, 
\"comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on state` then comparison is 15.87 times more likely to be a match\", \"column_name\": \"state\", \"value_l\": \"ca\", \"value_r\": \"ca\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 5, \"record_number\": 3}, {\"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Term freq adjustment on state with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 0.32361741187049275, \"log2_bayes_factor\": -1.6276388624488192, \"comparison_vector_value\": 1, \"bayes_factor_description\": \"Term frequency adjustment on state makes comparison  3.09 times less likely to be a match\", \"column_name\": \"tf_state\", \"value_l\": \"ca\", \"value_r\": \"ca\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 6, \"record_number\": 3}, {\"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Exact match on city\", \"m_probability\": 0.6839791977766309, \"u_probability\": 0.006331298335827164, \"bayes_factor\": 108.03142759932369, \"log2_bayes_factor\": 6.755307259996993, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on city` then comparison is 108 times more likely to be a match\", \"column_name\": \"city\", \"value_l\": \"san diego\", \"value_r\": \"san diego\", \"term_frequency_adjustment\": false, \"bar_sort_order\": 7, \"record_number\": 3}, {\"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Term freq adjustment on city with weight {cl.tf_adjustment_weight}\", \"m_probability\": null, \"u_probability\": null, \"bayes_factor\": 0.5427263263178971, \"log2_bayes_factor\": -0.8817032029251628, \"comparison_vector_value\": 2, \"bayes_factor_description\": \"Term frequency adjustment on city makes comparison  1.84 times less likely to be a match\", \"column_name\": \"tf_city\", 
\"value_l\": \"san diego\", \"value_r\": \"san diego\", \"term_frequency_adjustment\": true, \"bar_sort_order\": 8, \"record_number\": 3}, {\"column_name\": \"Final score\", \"label_for_charts\": \"Final score\", \"sql_condition\": null, \"log2_bayes_factor\": 0.3425051745602402, \"bayes_factor\": 1.2679564323427852, \"comparison_vector_value\": null, \"m_probability\": null, \"u_probability\": null, \"bayes_factor_description\": null, \"value_l\": \"\", \"value_r\": \"\", \"term_frequency_adjustment\": null, \"bar_sort_order\": 9, \"record_number\": 3}]}}, {\"mode\": \"vega-lite\"});\n",
        "</script>"
       ],
       "text/plain": [
        "alt.LayerChart(...)"
       ]
      },
-     "execution_count": 448,
+     "execution_count": 152,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -4197,597 +3850,46 @@
    "id": "a2ba43b6-a664-462a-823f-e3f08585bb51",
    "metadata": {},
    "source": [
-    "# Save good predictions"
+    "# Save good predictions\n",
+    "Make the predictions one-to-one. First, keep the highest-probability EIA utility ID for each SEC company. Then, keep the highest-probability SEC company for each EIA utility."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 192,
+   "execution_count": 153,
    "id": "92172e2f-39ba-49e3-8312-98597256ca4f",
    "metadata": {},
+   "outputs": [],
+   "source": [
+    "one_to_one_preds = preds_validation_df[preds_validation_df.match_probability >= .95].sort_values(\n",
+    "    by=\"match_probability\", ascending=False\n",
+    ").drop_duplicates(\n",
+    "    subset=\"sec_company_id\", keep=\"first\"\n",
+    ").drop_duplicates(\n",
+    "    subset=\"utility_id_eia\", keep=\"first\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 154,
+   "id": "07ca81ae-1b26-4cd3-ade6-75381028028a",
+   "metadata": {},
    "outputs": [
     {
      "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>match_weight</th>\n",
-       "      <th>match_probability</th>\n",
-       "      <th>source_dataset_l</th>\n",
-       "      <th>source_dataset_r</th>\n",
-       "      <th>record_id_l</th>\n",
-       "      <th>record_id_r</th>\n",
-       "      <th>company_name_l</th>\n",
-       "      <th>company_name_r</th>\n",
-       "      <th>gamma_company_name</th>\n",
-       "      <th>tf_company_name_l</th>\n",
-       "      <th>tf_company_name_r</th>\n",
-       "      <th>bf_company_name</th>\n",
-       "      <th>bf_tf_adj_company_name</th>\n",
-       "      <th>street_address_l</th>\n",
-       "      <th>street_address_r</th>\n",
-       "      <th>gamma_street_address</th>\n",
-       "      <th>tf_street_address_l</th>\n",
-       "      <th>tf_street_address_r</th>\n",
-       "      <th>bf_street_address</th>\n",
-       "      <th>bf_tf_adj_street_address</th>\n",
-       "      <th>zip_code_l</th>\n",
-       "      <th>zip_code_r</th>\n",
-       "      <th>gamma_zip_code</th>\n",
-       "      <th>tf_zip_code_l</th>\n",
-       "      <th>tf_zip_code_r</th>\n",
-       "      <th>bf_zip_code</th>\n",
-       "      <th>bf_tf_adj_zip_code</th>\n",
-       "      <th>city_l</th>\n",
-       "      <th>city_r</th>\n",
-       "      <th>gamma_city</th>\n",
-       "      <th>tf_city_l</th>\n",
-       "      <th>tf_city_r</th>\n",
-       "      <th>bf_city</th>\n",
-       "      <th>bf_tf_adj_city</th>\n",
-       "      <th>company_name_mphone_l</th>\n",
-       "      <th>company_name_mphone_r</th>\n",
-       "      <th>street_address_list_l</th>\n",
-       "      <th>street_address_list_r</th>\n",
-       "      <th>match_key</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>199607</th>\n",
-       "      <td>4.265490</td>\n",
-       "      <td>0.950575</td>\n",
-       "      <td>__splink__input_table_0</td>\n",
-       "      <td>__splink__input_table_1</td>\n",
-       "      <td>20077</td>\n",
-       "      <td>117512</td>\n",
-       "      <td>prt group incorporated</td>\n",
-       "      <td>pratt and whitney power systems</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.000019</td>\n",
-       "      <td>0.000010</td>\n",
-       "      <td>0.991220</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>80 lamberton rd</td>\n",
-       "      <td>mail stop 191-13</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.000036</td>\n",
-       "      <td>0.000012</td>\n",
-       "      <td>0.865948</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>06095</td>\n",
-       "      <td>06095</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.000191</td>\n",
-       "      <td>0.000191</td>\n",
-       "      <td>1148.002189</td>\n",
-       "      <td>3.403266</td>\n",
-       "      <td>windsor</td>\n",
-       "      <td>windsor</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000279</td>\n",
-       "      <td>0.000279</td>\n",
-       "      <td>126.999683</td>\n",
-       "      <td>24.882561</td>\n",
-       "      <td>PRT KRP</td>\n",
-       "      <td>PRT ANT HTN PWR SSTMS</td>\n",
-       "      <td>[80, lamberton, rd]</td>\n",
-       "      <td>[mail, stop, 191-13]</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>12041</th>\n",
-       "      <td>4.277468</td>\n",
-       "      <td>0.950964</td>\n",
-       "      <td>__splink__input_table_0</td>\n",
-       "      <td>__splink__input_table_1</td>\n",
-       "      <td>219453</td>\n",
-       "      <td>113555</td>\n",
-       "      <td>cogentrix energy incorporated</td>\n",
-       "      <td>green country energy limited liability company</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.000019</td>\n",
-       "      <td>0.000038</td>\n",
-       "      <td>0.991220</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>9405 arrowpoint blvd</td>\n",
-       "      <td>9405 arrowpoint blvd</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000534</td>\n",
-       "      <td>0.000534</td>\n",
-       "      <td>14580.390627</td>\n",
-       "      <td>0.015600</td>\n",
-       "      <td>28273</td>\n",
-       "      <td>28273</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.001256</td>\n",
-       "      <td>0.001256</td>\n",
-       "      <td>1148.002189</td>\n",
-       "      <td>0.516567</td>\n",
-       "      <td>charlotte</td>\n",
-       "      <td>chalotte</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.014155</td>\n",
-       "      <td>0.000022</td>\n",
-       "      <td>79.923487</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>KJNTRKS ENRJ</td>\n",
-       "      <td>KRN KNTR ENRJ</td>\n",
-       "      <td>[9405, arrowpoint, blvd]</td>\n",
-       "      <td>[9405, arrowpoint, blvd]</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>12805</th>\n",
-       "      <td>4.277468</td>\n",
-       "      <td>0.950964</td>\n",
-       "      <td>__splink__input_table_0</td>\n",
-       "      <td>__splink__input_table_1</td>\n",
-       "      <td>219453</td>\n",
-       "      <td>115755</td>\n",
-       "      <td>cogentrix energy incorporated</td>\n",
-       "      <td>jackson county power limited liability company</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.000019</td>\n",
-       "      <td>0.000029</td>\n",
-       "      <td>0.991220</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>9405 arrowpoint blvd</td>\n",
-       "      <td>9405 arrowpoint blvd</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000534</td>\n",
-       "      <td>0.000534</td>\n",
-       "      <td>14580.390627</td>\n",
-       "      <td>0.015600</td>\n",
-       "      <td>28273</td>\n",
-       "      <td>28273</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.001256</td>\n",
-       "      <td>0.001256</td>\n",
-       "      <td>1148.002189</td>\n",
-       "      <td>0.516567</td>\n",
-       "      <td>charlotte</td>\n",
-       "      <td>chaarlotte</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.014155</td>\n",
-       "      <td>0.000011</td>\n",
-       "      <td>79.923487</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>KJNTRKS ENRJ</td>\n",
-       "      <td>JKSN KNT PWR</td>\n",
-       "      <td>[9405, arrowpoint, blvd]</td>\n",
-       "      <td>[9405, arrowpoint, blvd]</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>8137</th>\n",
-       "      <td>4.278093</td>\n",
-       "      <td>0.950984</td>\n",
-       "      <td>__splink__input_table_0</td>\n",
-       "      <td>__splink__input_table_1</td>\n",
-       "      <td>64813</td>\n",
-       "      <td>3879</td>\n",
-       "      <td>rand logistics incorporated</td>\n",
-       "      <td>norridgewock river road solar limited liabilit...</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.000029</td>\n",
-       "      <td>0.000019</td>\n",
-       "      <td>0.991220</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>333 washington street</td>\n",
-       "      <td>333 washington street</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.001056</td>\n",
-       "      <td>0.001056</td>\n",
-       "      <td>14580.390627</td>\n",
-       "      <td>0.007888</td>\n",
-       "      <td>07302</td>\n",
-       "      <td>07302</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.002332</td>\n",
-       "      <td>0.002332</td>\n",
-       "      <td>1148.002189</td>\n",
-       "      <td>0.278152</td>\n",
-       "      <td>jersey city</td>\n",
-       "      <td>jersey city</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.002998</td>\n",
-       "      <td>0.002998</td>\n",
-       "      <td>126.999683</td>\n",
-       "      <td>2.312506</td>\n",
-       "      <td>RNT LJSTKS</td>\n",
-       "      <td>NRJWK RFR RT SLR</td>\n",
-       "      <td>[333, washington, street]</td>\n",
-       "      <td>[333, washington, street]</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>8136</th>\n",
-       "      <td>4.278093</td>\n",
-       "      <td>0.950984</td>\n",
-       "      <td>__splink__input_table_0</td>\n",
-       "      <td>__splink__input_table_1</td>\n",
-       "      <td>64813</td>\n",
-       "      <td>5193</td>\n",
-       "      <td>rand logistics incorporated</td>\n",
-       "      <td>anderson solar farm limited liability company</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.000029</td>\n",
-       "      <td>0.000029</td>\n",
-       "      <td>0.991220</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>333 washington street</td>\n",
-       "      <td>333 washington street</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.001056</td>\n",
-       "      <td>0.001056</td>\n",
-       "      <td>14580.390627</td>\n",
-       "      <td>0.007888</td>\n",
-       "      <td>07302</td>\n",
-       "      <td>07302</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.002332</td>\n",
-       "      <td>0.002332</td>\n",
-       "      <td>1148.002189</td>\n",
-       "      <td>0.278152</td>\n",
-       "      <td>jersey city</td>\n",
-       "      <td>jersey city</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.002998</td>\n",
-       "      <td>0.002998</td>\n",
-       "      <td>126.999683</td>\n",
-       "      <td>2.312506</td>\n",
-       "      <td>RNT LJSTKS</td>\n",
-       "      <td>ANTRSN SLR FRM</td>\n",
-       "      <td>[333, washington, street]</td>\n",
-       "      <td>[333, washington, street]</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>199278</th>\n",
-       "      <td>27.514584</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>__splink__input_table_0</td>\n",
-       "      <td>__splink__input_table_1</td>\n",
-       "      <td>27759</td>\n",
-       "      <td>142183</td>\n",
-       "      <td>diamond brands incorporated</td>\n",
-       "      <td>diamond brands incorporated</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000029</td>\n",
-       "      <td>0.000029</td>\n",
-       "      <td>7612.680596</td>\n",
-       "      <td>0.037986</td>\n",
-       "      <td>1800 cloquet avenue</td>\n",
-       "      <td>1800 cloquet avenue</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000036</td>\n",
-       "      <td>0.000036</td>\n",
-       "      <td>14580.390627</td>\n",
-       "      <td>0.233998</td>\n",
-       "      <td>55720</td>\n",
-       "      <td>55720</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.000078</td>\n",
-       "      <td>0.000078</td>\n",
-       "      <td>1148.002189</td>\n",
-       "      <td>8.265075</td>\n",
-       "      <td>cloquet</td>\n",
-       "      <td>cloquet</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000078</td>\n",
-       "      <td>0.000078</td>\n",
-       "      <td>126.999683</td>\n",
-       "      <td>88.866289</td>\n",
-       "      <td>TMNT BRNTS</td>\n",
-       "      <td>TMNT BRNTS</td>\n",
-       "      <td>[1800, cloquet, avenue]</td>\n",
-       "      <td>[1800, cloquet, avenue]</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>485070</th>\n",
-       "      <td>27.655362</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>__splink__input_table_0</td>\n",
-       "      <td>__splink__input_table_1</td>\n",
-       "      <td>50420</td>\n",
-       "      <td>95697</td>\n",
-       "      <td>gulf power company</td>\n",
-       "      <td>gulf power company</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000038</td>\n",
-       "      <td>0.000038</td>\n",
-       "      <td>7612.680596</td>\n",
-       "      <td>0.028490</td>\n",
-       "      <td>one energy place</td>\n",
-       "      <td>one energy place</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000024</td>\n",
-       "      <td>0.000024</td>\n",
-       "      <td>14580.390627</td>\n",
-       "      <td>0.350997</td>\n",
-       "      <td>32520</td>\n",
-       "      <td>32520</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.000056</td>\n",
-       "      <td>0.000056</td>\n",
-       "      <td>1148.002189</td>\n",
-       "      <td>11.571104</td>\n",
-       "      <td>pensacola</td>\n",
-       "      <td>pensacola</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000111</td>\n",
-       "      <td>0.000111</td>\n",
-       "      <td>126.999683</td>\n",
-       "      <td>62.206402</td>\n",
-       "      <td>KLF PWR</td>\n",
-       "      <td>KLF PWR</td>\n",
-       "      <td>[one, energy, place]</td>\n",
-       "      <td>[one, energy, place]</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>331565</th>\n",
-       "      <td>27.977290</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>__splink__input_table_0</td>\n",
-       "      <td>__splink__input_table_1</td>\n",
-       "      <td>170775</td>\n",
-       "      <td>78563</td>\n",
-       "      <td>berry petroleum company</td>\n",
-       "      <td>berry petroleum company</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000096</td>\n",
-       "      <td>0.000096</td>\n",
-       "      <td>7612.680596</td>\n",
-       "      <td>0.011396</td>\n",
-       "      <td>28700 hovey hills rd</td>\n",
-       "      <td>28700 hovey hills rd</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000024</td>\n",
-       "      <td>0.000024</td>\n",
-       "      <td>14580.390627</td>\n",
-       "      <td>0.350997</td>\n",
-       "      <td>93268</td>\n",
-       "      <td>93268</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.000045</td>\n",
-       "      <td>0.000045</td>\n",
-       "      <td>1148.002189</td>\n",
-       "      <td>14.463881</td>\n",
-       "      <td>taft</td>\n",
-       "      <td>taft</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000045</td>\n",
-       "      <td>0.000045</td>\n",
-       "      <td>126.999683</td>\n",
-       "      <td>155.516006</td>\n",
-       "      <td>BR PTRLM</td>\n",
-       "      <td>BR PTRLM</td>\n",
-       "      <td>[28700, hovey, hills, rd]</td>\n",
-       "      <td>[28700, hovey, hills, rd]</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>869341</th>\n",
-       "      <td>28.977290</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>__splink__input_table_0</td>\n",
-       "      <td>__splink__input_table_1</td>\n",
-       "      <td>39609</td>\n",
-       "      <td>141382</td>\n",
-       "      <td>eme homer city generation limited partnership</td>\n",
-       "      <td>eme homer city generation limited partnership</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000038</td>\n",
-       "      <td>0.000038</td>\n",
-       "      <td>7612.680596</td>\n",
-       "      <td>0.028490</td>\n",
-       "      <td>1750 power plant road</td>\n",
-       "      <td>1750 power plant road</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000024</td>\n",
-       "      <td>0.000024</td>\n",
-       "      <td>14580.390627</td>\n",
-       "      <td>0.350997</td>\n",
-       "      <td>15748</td>\n",
-       "      <td>15748</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.000045</td>\n",
-       "      <td>0.000045</td>\n",
-       "      <td>1148.002189</td>\n",
-       "      <td>14.463881</td>\n",
-       "      <td>homer city</td>\n",
-       "      <td>homer city</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000056</td>\n",
-       "      <td>0.000056</td>\n",
-       "      <td>126.999683</td>\n",
-       "      <td>124.412805</td>\n",
-       "      <td>EM HMR ST JNRXN</td>\n",
-       "      <td>EM HMR ST JNRXN</td>\n",
-       "      <td>[1750, power, plant, road]</td>\n",
-       "      <td>[1750, power, plant, road]</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>73212</th>\n",
-       "      <td>29.544331</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>__splink__input_table_0</td>\n",
-       "      <td>__splink__input_table_1</td>\n",
-       "      <td>224681</td>\n",
-       "      <td>50859</td>\n",
-       "      <td>selkirk cogen partners limited partnership</td>\n",
-       "      <td>selkirk cogen partners limited partnership</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000058</td>\n",
-       "      <td>0.000058</td>\n",
-       "      <td>7612.680596</td>\n",
-       "      <td>0.018993</td>\n",
-       "      <td>24 power park drive</td>\n",
-       "      <td>24 power park drive</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000024</td>\n",
-       "      <td>0.000024</td>\n",
-       "      <td>14580.390627</td>\n",
-       "      <td>0.350997</td>\n",
-       "      <td>12158</td>\n",
-       "      <td>12158</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.000034</td>\n",
-       "      <td>0.000034</td>\n",
-       "      <td>1148.002189</td>\n",
-       "      <td>19.285174</td>\n",
-       "      <td>selkirk</td>\n",
-       "      <td>selkirk</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000033</td>\n",
-       "      <td>0.000033</td>\n",
-       "      <td>126.999683</td>\n",
-       "      <td>207.354675</td>\n",
-       "      <td>SLKRK KJN PRTNRS</td>\n",
-       "      <td>SLKRK KJN PRTNRS</td>\n",
-       "      <td>[24, power, park, drive]</td>\n",
-       "      <td>[24, power, park, drive]</td>\n",
-       "      <td>0</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>3014 rows × 39 columns</p>\n",
-       "</div>"
-      ],
       "text/plain": [
-       "        match_weight  match_probability         source_dataset_l         source_dataset_r  record_id_l  record_id_r                                 company_name_l                                     company_name_r  gamma_company_name  tf_company_name_l  tf_company_name_r  bf_company_name  bf_tf_adj_company_name       street_address_l       street_address_r  gamma_street_address  tf_street_address_l  tf_street_address_r  bf_street_address  bf_tf_adj_street_address zip_code_l zip_code_r  gamma_zip_code  tf_zip_code_l  tf_zip_code_r  bf_zip_code  bf_tf_adj_zip_code       city_l       city_r  gamma_city  tf_city_l  tf_city_r     bf_city  bf_tf_adj_city company_name_mphone_l  company_name_mphone_r       street_address_list_l       street_address_list_r match_key\n",
-       "199607      4.265490           0.950575  __splink__input_table_0  __splink__input_table_1        20077       117512                         prt group incorporated                    pratt and whitney power systems                   0           0.000019           0.000010         0.991220                1.000000        80 lamberton rd       mail stop 191-13                     0             0.000036             0.000012           0.865948                  1.000000      06095      06095               1       0.000191       0.000191  1148.002189            3.403266      windsor      windsor           2   0.000279   0.000279  126.999683       24.882561               PRT KRP  PRT ANT HTN PWR SSTMS         [80, lamberton, rd]        [mail, stop, 191-13]         0\n",
-       "12041       4.277468           0.950964  __splink__input_table_0  __splink__input_table_1       219453       113555                  cogentrix energy incorporated     green country energy limited liability company                   0           0.000019           0.000038         0.991220                1.000000   9405 arrowpoint blvd   9405 arrowpoint blvd                     2             0.000534             0.000534       14580.390627                  0.015600      28273      28273               1       0.001256       0.001256  1148.002189            0.516567    charlotte     chalotte           1   0.014155   0.000022   79.923487        1.000000          KJNTRKS ENRJ          KRN KNTR ENRJ    [9405, arrowpoint, blvd]    [9405, arrowpoint, blvd]         1\n",
-       "12805       4.277468           0.950964  __splink__input_table_0  __splink__input_table_1       219453       115755                  cogentrix energy incorporated     jackson county power limited liability company                   0           0.000019           0.000029         0.991220                1.000000   9405 arrowpoint blvd   9405 arrowpoint blvd                     2             0.000534             0.000534       14580.390627                  0.015600      28273      28273               1       0.001256       0.001256  1148.002189            0.516567    charlotte   chaarlotte           1   0.014155   0.000011   79.923487        1.000000          KJNTRKS ENRJ           JKSN KNT PWR    [9405, arrowpoint, blvd]    [9405, arrowpoint, blvd]         1\n",
-       "8137        4.278093           0.950984  __splink__input_table_0  __splink__input_table_1        64813         3879                    rand logistics incorporated  norridgewock river road solar limited liabilit...                   0           0.000029           0.000019         0.991220                1.000000  333 washington street  333 washington street                     2             0.001056             0.001056       14580.390627                  0.007888      07302      07302               1       0.002332       0.002332  1148.002189            0.278152  jersey city  jersey city           2   0.002998   0.002998  126.999683        2.312506            RNT LJSTKS       NRJWK RFR RT SLR   [333, washington, street]   [333, washington, street]         1\n",
-       "8136        4.278093           0.950984  __splink__input_table_0  __splink__input_table_1        64813         5193                    rand logistics incorporated      anderson solar farm limited liability company                   0           0.000029           0.000029         0.991220                1.000000  333 washington street  333 washington street                     2             0.001056             0.001056       14580.390627                  0.007888      07302      07302               1       0.002332       0.002332  1148.002189            0.278152  jersey city  jersey city           2   0.002998   0.002998  126.999683        2.312506            RNT LJSTKS         ANTRSN SLR FRM   [333, washington, street]   [333, washington, street]         1\n",
-       "...              ...                ...                      ...                      ...          ...          ...                                            ...                                                ...                 ...                ...                ...              ...                     ...                    ...                    ...                   ...                  ...                  ...                ...                       ...        ...        ...             ...            ...            ...          ...                 ...          ...          ...         ...        ...        ...         ...             ...                   ...                    ...                         ...                         ...       ...\n",
-       "199278     27.514584           1.000000  __splink__input_table_0  __splink__input_table_1        27759       142183                    diamond brands incorporated                        diamond brands incorporated                   2           0.000029           0.000029      7612.680596                0.037986    1800 cloquet avenue    1800 cloquet avenue                     2             0.000036             0.000036       14580.390627                  0.233998      55720      55720               1       0.000078       0.000078  1148.002189            8.265075      cloquet      cloquet           2   0.000078   0.000078  126.999683       88.866289            TMNT BRNTS             TMNT BRNTS     [1800, cloquet, avenue]     [1800, cloquet, avenue]         0\n",
-       "485070     27.655362           1.000000  __splink__input_table_0  __splink__input_table_1        50420        95697                             gulf power company                                 gulf power company                   2           0.000038           0.000038      7612.680596                0.028490       one energy place       one energy place                     2             0.000024             0.000024       14580.390627                  0.350997      32520      32520               1       0.000056       0.000056  1148.002189           11.571104    pensacola    pensacola           2   0.000111   0.000111  126.999683       62.206402               KLF PWR                KLF PWR        [one, energy, place]        [one, energy, place]         0\n",
-       "331565     27.977290           1.000000  __splink__input_table_0  __splink__input_table_1       170775        78563                        berry petroleum company                            berry petroleum company                   2           0.000096           0.000096      7612.680596                0.011396   28700 hovey hills rd   28700 hovey hills rd                     2             0.000024             0.000024       14580.390627                  0.350997      93268      93268               1       0.000045       0.000045  1148.002189           14.463881         taft         taft           2   0.000045   0.000045  126.999683      155.516006              BR PTRLM               BR PTRLM   [28700, hovey, hills, rd]   [28700, hovey, hills, rd]         0\n",
-       "869341     28.977290           1.000000  __splink__input_table_0  __splink__input_table_1        39609       141382  eme homer city generation limited partnership      eme homer city generation limited partnership                   2           0.000038           0.000038      7612.680596                0.028490  1750 power plant road  1750 power plant road                     2             0.000024             0.000024       14580.390627                  0.350997      15748      15748               1       0.000045       0.000045  1148.002189           14.463881   homer city   homer city           2   0.000056   0.000056  126.999683      124.412805       EM HMR ST JNRXN        EM HMR ST JNRXN  [1750, power, plant, road]  [1750, power, plant, road]         0\n",
-       "73212      29.544331           1.000000  __splink__input_table_0  __splink__input_table_1       224681        50859     selkirk cogen partners limited partnership         selkirk cogen partners limited partnership                   2           0.000058           0.000058      7612.680596                0.018993    24 power park drive    24 power park drive                     2             0.000024             0.000024       14580.390627                  0.350997      12158      12158               1       0.000034       0.000034  1148.002189           19.285174      selkirk      selkirk           2   0.000033   0.000033  126.999683      207.354675      SLKRK KJN PRTNRS       SLKRK KJN PRTNRS    [24, power, park, drive]    [24, power, park, drive]         0\n",
-       "\n",
-       "[3014 rows x 39 columns]"
+       "525"
       ]
      },
-     "execution_count": 192,
+     "execution_count": 154,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "preds_df[preds_df.match_probability >= .95].sort_values(by=\"match_probability\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "ad4d3859-81d1-4fa8-98cc-ff7c9fd038f6",
-   "metadata": {},
-   "source": [
-    "# Match to Ex. 21 subsidiaries"
+    "len(one_to_one_preds)"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 38,
-   "id": "d1c56b09-80c7-4bfe-b1ec-c0220cadafbf",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# match EIA records that don't have a prediction to EIA subsidiaries\n",
-    "# can reuse code from SEC module?"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a5599b7a-ea9a-40fd-9ce1-cb79a8d4dc35",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/src/mozilla_sec_eia/library/record_linkage_utils.py b/src/mozilla_sec_eia/library/record_linkage_utils.py
index 924b6a4..217fb9b 100644
--- a/src/mozilla_sec_eia/library/record_linkage_utils.py
+++ b/src/mozilla_sec_eia/library/record_linkage_utils.py
@@ -1,6 +1,8 @@
 """Utility functions for cleaning strings during modeling preprocessing steps."""
 
+import json
 from enum import StrEnum
+from importlib import resources
 
 import jellyfish
 import pandas as pd
@@ -91,6 +93,19 @@ def handle_invalid_names(
     return df
 
 
+def flatten_companies_across_time(
+    df: pd.DataFrame, key_cols: list[str], date_col: str = "report_date"
+) -> pd.DataFrame:
+    """Keep the most recent non-null value of each column per `key_cols` group.
+
+    `df` must have `key_cols` and `date_col`; rows with a null key are dropped.
+    """
+    df = (
+        df.sort_values(by=date_col, ascending=False).groupby(key_cols).first()
+    ).reset_index()
+    return df
+
+
 # TODO: this is in PUDL, deduplicate
 def get_metaphone_col(col: pd.Series) -> pd.Series:
     """Get the metaphones of the strings in a column."""
@@ -133,3 +148,23 @@ def fill_street_address_nulls(
         df[secondary_address_col],
     )
     return df
+
+
+def expand_street_name_abbreviations(col: pd.Series) -> pd.Series:
+    """Standardize street address suffixes, e.g. street and str become st.
+
+    Despite the name, most suffixes are abbreviated. Expects lower case strings.
+    """
+    # remove punctuation from column first
+    col = col.str.replace(r"[^\w\s]", "", regex=True)
+
+    json_source = (
+        resources.files("mozilla_sec_eia.package_data")
+        / "street_suffix_abbreviations.json"
+    )
+    with json_source.open() as f:
+        address_expansions = json.load(f)
+    for standard_abbr, suffix_list in address_expansions.items():
+        pattern = r"\b(" + "|".join(suffix_list) + r")\b"
+        col = col.str.replace(pattern, standard_abbr, regex=True)
+    return col
diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py
deleted file mode 100644
index 3caa182..0000000
--- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py
+++ /dev/null
@@ -1,167 +0,0 @@
-"""Preprocessing for EIA and SEC input data before record linkage."""
-
-import jellyfish
-import numpy as np
-import pandas as pd
-
-from pudl.analysis.record_linkage import name_cleaner
-
-EIA_COL_MAP = {
-    "utility_name_eia": "company_name",  # TODO: should be linking to owner or operator name?
-    "address_2": "street_address_2",
-}
-
-SEC_COL_MAP = {
-    "company_conformed_name": "company_name",
-    "street_1": "street_address",
-    "street_2": "street_address_2",
-    "zip": "zip_code",
-    "business_phone": "phone_number",
-    "date_filed": "report_date",
-}
-
-SHARED_COLS = [
-    "report_date",
-    "report_year",
-    "company_name",
-    "street_address",
-    "street_address_2",
-    "city",
-    "state",  # could use state of incorporation from SEC
-    "zip_code",
-    "phone_number",
-]
-
-STR_COLS = [
-    "company_name",
-    "street_address",
-    "street_address_2",
-    "city",
-    "state",
-    "zip_code",
-]
-
-INVALID_NAMES = [
-    "llc",
-    "limited liability company",
-    "limited",
-    "ltd",
-    "iiii",
-    "inc",
-    "incorporated",
-    "partnership",
-    "i",
-    "name",
-    "company",
-    "&",
-    "",
-]
-
-
-company_name_cleaner = name_cleaner.CompanyNameCleaner(
-    cleaning_rules_list=[
-        "remove_word_the_from_the_end",
-        "remove_word_the_from_the_beginning",
-        "replace_ampersand_by_AND",
-        "replace_hyphen_by_space",
-        "replace_underscore_by_space",
-        "remove_text_punctuation",
-        "remove_parentheses",
-        "remove_brackets",
-        "remove_curly_brackets",
-        "enforce_single_space_between_words",
-    ]
-)
-
-legal_term_remover = name_cleaner.CompanyNameCleaner(
-    cleaning_rules_list=[], handle_legal_terms=2
-)
-
-
-# TODO: this is in PUDL, pull out into helper function
-def _get_metaphone(row, col_name):
-    if pd.isnull(row[col_name]):
-        return None
-    return jellyfish.metaphone(row[col_name])
-
-
-# TODO: delete
-def _clean_company_name(df):
-    df.loc[:, "company_name_clean"] = company_name_cleaner.apply_name_cleaning(
-        df[["company_name"]]
-    ).str.strip()
-    df = df[df["company_name_clean"] != ""]
-    df = df.rename(columns={"company_name": "company_name_raw"}).rename(
-        columns={"company_name_clean": "company_name"}
-    )
-    df.loc[:, "company_name_no_legal"] = legal_term_remover.apply_name_cleaning(
-        df[["company_name"]]
-    )
-    return df
-
-
-# TODO: delete
-def clean_sec_df(df):
-    """Shared cleaning for SEC 10K and Ex. 21 dataframes.
-
-    Arguments:
-        df: Ex. 21 or SEC 10K basic info dataframe with columns
-        company_name, loc_of_incorporation, and report_year.
-    """
-    df = _clean_company_name(df)
-    df.loc[:, "company_name_mphone"] = df.apply(
-        _get_metaphone, axis=1, args=("company_name_no_legal",)
-    )
-    df = df[
-        (~df["company_name"].isin(INVALID_NAMES))
-        & (~df["company_name_raw"].isin(INVALID_NAMES))
-    ]
-    df = df.fillna(np.nan)
-
-    return df
-
-
-# TODO: delete
-def prepare_sec10k_basic_info_df(sec_df):
-    """Preprocess SEC 10k basic information dataframe for record linkage."""
-    sec_df = sec_df.rename(columns=SEC_COL_MAP).reset_index()
-    sec_df = clean_sec_df(sec_df)
-    sec_df[STR_COLS] = sec_df[STR_COLS].apply(lambda x: x.str.strip().str.lower())
-    # TODO: does this actually drop anything?
-    sec_df = sec_df.drop_duplicates(
-        subset=[
-            "central_index_key",
-            "report_year",
-            "company_name",
-            "standard_industrial_classification",
-            "city",
-            "state",
-            "street_address",
-            "zip_code",
-        ]
-    )
-    return sec_df
-
-
-# TODO: delete
-def prepare_ex21_df(ex21_df):
-    """Preprocess Ex. 21 extracted dataframe for record linkage."""
-    ex21_df = clean_sec_df(ex21_df)
-    return ex21_df
-
-
-# TODO: delete
-def prepare_eia_df(eia_df):
-    """Preprocess EIA utility dataframe for record linkage."""
-    eia_df = eia_df.rename(columns=EIA_COL_MAP)
-    eia_df.loc[:, "report_year"] = (
-        eia_df["report_date"].astype("datetime64[ns]").dt.year
-    )
-    eia_df = eia_df.fillna(np.nan)
-    eia_df[STR_COLS] = eia_df[STR_COLS].apply(lambda x: x.str.strip().str.lower())
-    eia_df = _clean_company_name(eia_df)
-    eia_df.loc[:, "company_name_mphone"] = eia_df.apply(
-        _get_metaphone, axis=1, args=("company_name_no_legal",)
-    )
-    eia_df = eia_df.reset_index(drop=True).reset_index(names="record_id")
-    return eia_df
diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_eia_splink_config.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_eia_splink_config.py
index 3a5edae..c8ccfd9 100644
--- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_eia_splink_config.py
+++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_eia_splink_config.py
@@ -32,8 +32,8 @@
 BLOCKING_RULES = [
     "substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4)",
     "l.street_address = r.street_address",
-    "substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3) and l.city = r.city",
-    "substr(l.company_name_mphone,1,2) = substr(r.company_name_mphone,1,2) and array_length(list_intersect(l.street_address_list, r.street_address_list)) >= 2",
+    "substr(l.company_name_mphone,1,2) = substr(r.company_name_mphone,1,2) and l.city = r.city",
+    # "substr(l.company_name_mphone,1,2) = substr(r.company_name_mphone,1,2) and array_length(list_intersect(l.street_address_list, r.street_address_list)) >= 2",
 ]
 
 company_name_comparison = cl.NameComparison(
@@ -44,7 +44,6 @@
 address_comparison = cl.LevenshteinAtThresholds(
     "street_address", distance_threshold_or_thresholds=[1]
 ).configure(term_frequency_adjustments=True)
-print(address_comparison.get_comparison("duckdb").human_readable_description)
 
 state_comparison = cl.ExactMatch("state").configure(term_frequency_adjustments=True)
 city_comparison = cl.NameComparison("city", jaro_winkler_thresholds=[0.9])
diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py
index 4da5c1b..c832cf0 100644
--- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py
+++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py
@@ -5,7 +5,9 @@
 from dagster import AssetOut, multi_asset
 
 from mozilla_sec_eia.library.record_linkage_utils import (
+    expand_street_name_abbreviations,
     fill_street_address_nulls,
+    flatten_companies_across_time,
     transform_company_name,
 )
 from mozilla_sec_eia.models.sec_eia_record_linkage.sec_eia_splink_config import STR_COLS
@@ -87,19 +89,27 @@ def eia_rl_input_table():
         "s3://pudl.catalyst.coop/stable/out_eia__yearly_utilities.parquet"
     )
     eia861_df = harvest_eia861_utilities()
-    eia_df = pd.concat([raw_eia_df, eia861_df])
-    eia_df = eia_df.drop_duplicates(
-        subset=["utility_id_eia", "report_date"], keep="first"
-    ).dropna(subset="utility_name_eia")
-    eia_df = eia_df.rename(columns=EIA_COL_MAP)
-    eia_df["report_date"] = eia_df["report_date"].astype("datetime64[ns]")
-    eia_df.loc[:, "report_year"] = eia_df["report_date"].dt.year
-    eia_df = transform_company_name(eia_df)
-    eia_df.loc[:, "zip_code"] = eia_df["zip_code"].str[:5]
-    eia_df = fill_street_address_nulls(eia_df)
+    eia_df = (
+        pd.concat([raw_eia_df, eia861_df])
+        .dropna(subset=["utility_name_eia"])
+        .rename(columns=EIA_COL_MAP)
+        .assign(
+            report_date=lambda df: df["report_date"].astype("datetime64[ns]"),
+            report_year=lambda df: df["report_date"].dt.year,
+            zip_code=lambda df: df["zip_code"].str[:5],
+        )
+        .pipe(transform_company_name)
+        .pipe(fill_street_address_nulls)
+        .pipe(lambda df: df.fillna(np.nan))
+        .reset_index(drop=True)
+    )
     eia_df[STR_COLS] = eia_df[STR_COLS].apply(lambda x: x.str.strip().str.lower())
-    eia_df = eia_df.fillna(np.nan)
-    eia_df = eia_df.reset_index(drop=True).reset_index(names="record_id")
+    eia_df["street_address"] = expand_street_name_abbreviations(
+        eia_df["street_address"]
+    )
+    eia_df = flatten_companies_across_time(
+        df=eia_df, key_cols=["company_name", "street_address"]
+    ).reset_index(names="record_id")
 
     return eia_df
 
diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py
index ff88151..0a51f23 100644
--- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py
+++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py
@@ -10,7 +10,9 @@
 from dagster import AssetIn, asset
 
 from mozilla_sec_eia.library.record_linkage_utils import (
+    expand_street_name_abbreviations,
     fill_street_address_nulls,
+    flatten_companies_across_time,
     transform_company_name,
 )
 from mozilla_sec_eia.models.sec10k.utils.cloud import (
@@ -73,24 +75,6 @@ def _add_report_year_to_sec(sec_df: pd.DataFrame, md: pd.DataFrame) -> pd.DataFr
     return sec_df
 
 
-def _flatten_sec_companies_across_time(sec_df: pd.DataFrame) -> pd.DataFrame:
-    """Keep only the most recent record for each unique SEC company.
-
-    Note that this drops old records for companies that have changed
-    names or addresses across time. Also, we group by sec_company_id not
-    CIK, so filer companies and subsidiary companies are unique in the
-    output dataframe.
-    TODO: create an asset that tracks name and address chnages across
-    time.
-    """
-    sec_df = (
-        sec_df.sort_values(by="report_year", ascending=False)
-        .groupby("sec_company_id")
-        .first()
-    ).reset_index()
-    return sec_df
-
-
 def get_sec_state_code_dict() -> dict[str, str]:
     """Create a dictionary mapping state codes to their names.
 
@@ -236,6 +220,8 @@ def match_ex21_subsidiaries_to_filer_company(
     ex21_with_cik_df = ex21_with_cik_df.rename(
         columns={"subsidiary_cik": "central_index_key"}
     )
+    ex21_with_cik_df = ex21_with_cik_df.drop_duplicates()
+
     return ex21_with_cik_df
 
 
@@ -281,7 +267,9 @@ def transformed_ex21_subsidiary_table(
     # add an sec_company_id, ultimately this ID become the subsidiary's CIK
     # if the subsidiary is matched to an SEC filer
     ex21_df = create_sec_company_id_for_ex21_subs(ex21_df=ex21_df)
-    ex21_df = _flatten_sec_companies_across_time(ex21_df)
+    ex21_df = flatten_companies_across_time(
+        df=ex21_df, key_cols=["sec_company_id"], date_col="report_year"
+    )
     ex21_df = ex21_df.fillna(np.nan)
 
     return ex21_df
@@ -295,35 +283,45 @@ def transform_basic10k_table(
         values="value", index="filename", columns="key", aggfunc="first"
     )
     basic_10k_df.columns.name = None
-    # TODO: chain these function calls together
-    basic_10k_df = basic_10k_df.reset_index()
-    basic_10k_df = _remove_weird_sec_cols(basic_10k_df)
-    basic_10k_df = _add_report_year_to_sec(basic_10k_df, sec10k_filing_metadata)
-    basic_10k_df = basic_10k_df.rename(columns=SEC_COL_MAP)
-    # add a location of incorporation to better match it to Ex. 21 subsidiaries
-    basic_10k_df = clean_location_of_inc(basic_10k_df)
-    basic_10k_df = transform_company_name(basic_10k_df)
-    basic_10k_df.loc[:, "zip_code"] = basic_10k_df["zip_code"].str[:5]
-    basic_10k_df = fill_street_address_nulls(basic_10k_df)
-    basic_10k_df.loc[:, "files_10k"] = True
-    basic_10k_df.loc[:, "sec_company_id"] = basic_10k_df["central_index_key"]
+    basic_10k_df = (
+        basic_10k_df.reset_index()
+        .pipe(_remove_weird_sec_cols)
+        .pipe(_add_report_year_to_sec, sec10k_filing_metadata)
+        .rename(columns=SEC_COL_MAP)
+        .pipe(clean_location_of_inc)
+        .pipe(transform_company_name)
+        .assign(
+            zip_code=lambda df: df["zip_code"].str[:5],
+            files_10k=True,
+            sec_company_id=lambda df: df["central_index_key"],
+        )
+        .pipe(fill_street_address_nulls)
+    )
     basic_10k_df[STR_COLS] = basic_10k_df[STR_COLS].apply(
         lambda x: x.str.strip().str.lower()
     )
+    basic_10k_df["street_address"] = expand_street_name_abbreviations(
+        basic_10k_df["street_address"]
+    )
+    # flatten across time on unique company name and address pair
+    basic_10k_df = flatten_companies_across_time(
+        df=basic_10k_df, key_cols=["company_name", "street_address"]
+    )
+
     return basic_10k_df
 
 
 @asset(
     ins={
         "basic_10k_dfs": AssetIn("basic_10k_company_info"),
-        "clean_ex21_df": AssetIn("transformed_ex21_subsidiary_table"),
+        # "clean_ex21_df": AssetIn("transformed_ex21_subsidiary_table"),
         "sec10k_filing_metadata_dfs": AssetIn("sec10k_filing_metadata"),
         # specify an io_manager_key?
     },
 )
-def core_sec_10k__parents_and_subsidiaries(
+def core_sec_10k__filers(
     basic_10k_dfs: dict[str, pd.DataFrame],
-    clean_ex21_df: pd.DataFrame,
+    # clean_ex21_df: pd.DataFrame,
     sec10k_filing_metadata_dfs: dict[str, pd.DataFrame],
 ) -> pd.DataFrame:
     """Asset for creating an SEC 10K output table.
@@ -336,10 +334,34 @@ def core_sec_10k__parents_and_subsidiaries(
     basic_10k_df = pd.concat(basic_10k_dfs.values())
     sec10k_filing_metadata = pd.concat(sec10k_filing_metadata_dfs.values())
     basic_10k_df = transform_basic10k_table(basic_10k_df, sec10k_filing_metadata)
+    # exclude Ex. 21 subs and just match to filers
+    # once the match has been conducted, add back in the Ex. 21 subs
+    out_df = basic_10k_df.fillna(np.nan).reset_index(names="record_id")
+    # TODO: Here we conduct the match to EIA and add on a column with utility_id_eia
+    return out_df
+
+
+@asset(
+    ins={
+        "sec10k_filers_matched_df": AssetIn("core_sec_10k__filers"),
+        "clean_ex21_df": AssetIn("transformed_ex21_subsidiary_table"),
+    },
+)
+def out_sec_10k__parents_and_subsidiaries(
+    sec10k_filers_matched_df: pd.DataFrame,
+    clean_ex21_df: pd.DataFrame,
+) -> pd.DataFrame:
+    """Asset for creating an SEC 10K output table.
+
+    Flatten the table across time to only keep the most recent record
+    for each CIK. Add in Ex. 21 subsidiaries and link them to already present
+    filing companies. Create an sec_company_id for subsidiaries that aren't linked
+    to a CIK.
+    """
     ex21_df_with_cik = match_ex21_subsidiaries_to_filer_company(
-        basic10k_df=basic_10k_df, ex21_df=clean_ex21_df
+        basic10k_df=sec10k_filers_matched_df, ex21_df=clean_ex21_df
     )
-    basic_10k_df = basic_10k_df.merge(
+    sec10k_filers_matched_df = sec10k_filers_matched_df.merge(
         ex21_df_with_cik[["central_index_key", "parent_company_cik", "own_per"]],
         how="left",
         on="central_index_key",
@@ -349,16 +371,13 @@ def core_sec_10k__parents_and_subsidiaries(
         ex21_df_with_cik["central_index_key"].isnull()
     ]
     ex21_non_filing_subs_df.loc[:, "files_10k"] = False
-    out_df = pd.concat([basic_10k_df, ex21_non_filing_subs_df])
-    out_df = out_df.fillna(np.nan)
-    # this drops records for earlier company names and addresses
-    # that have since changed, so we lose some information
-    out_df = _flatten_sec_companies_across_time(out_df)
-
+    out_df = pd.concat([sec10k_filers_matched_df, ex21_non_filing_subs_df])
+    # TODO: match the EIA utilities to the Ex. 21 subs?
     return out_df
 
 
 production_assets = [
-    core_sec_10k__parents_and_subsidiaries,
+    core_sec_10k__filers,
     transformed_ex21_subsidiary_table,
+    out_sec_10k__parents_and_subsidiaries,
 ]
diff --git a/src/mozilla_sec_eia/package_data/street_suffix_abbreviations.json b/src/mozilla_sec_eia/package_data/street_suffix_abbreviations.json
new file mode 100644
index 0000000..e305113
--- /dev/null
+++ b/src/mozilla_sec_eia/package_data/street_suffix_abbreviations.json
@@ -0,0 +1,203 @@
+{
+    "aly": ["alley", "allee", "ally"],
+    "anx": ["anex", "annex", "annx"],
+    "arc": ["arcade"],
+    "ave": ["avenue", "av", "aven", "avenu", "avn", "avnue"],
+    "byu": ["bayou", "bayoo"],
+    "bch": ["beach"],
+    "bnd": ["bend"],
+    "blf": ["bluff", "bluf"],
+    "blfs": ["bluffs"],
+    "btm": ["bottom", "bot", "bottm"],
+    "blvd": ["boulevard", "boul", "boulv"],
+    "br": ["branch", "brnch"],
+    "brg": ["bridge", "brdge"],
+    "brk": ["brook"],
+    "brks": ["brooks"],
+    "bg": ["burg"],
+    "bgs": ["burgs"],
+    "byp": ["bypass", "bypa", "bypas", "byps"],
+    "cp": ["camp", "cmp"],
+    "cyn": ["canyon", "canyn", "cnyn"],
+    "cpe": ["cape"],
+    "cswy": ["causeway", "causwa"],
+    "ctr": ["center", "cen", "cent", "centr", "centre", "cnter", "cntr"],
+    "ctrs": ["centers"],
+    "cir": ["circle", "circ", "circl", "crcl", "crcle"],
+    "cirs": ["circles"],
+    "clf": ["cliff"],
+    "clfs": ["cliffs"],
+    "clb": ["club"],
+    "cmn": ["common"],
+    "cmns": ["commons"],
+    "cor": ["corner"],
+    "cors": ["corners"],
+    "crse": ["course"],
+    "ct": ["court"],
+    "cts": ["courts"],
+    "cv": ["cove"],
+    "cvs": ["coves"],
+    "crk": ["creek"],
+    "cres": ["crescent", "crsent", "crsnt"],
+    "crst": ["crest"],
+    "xing": ["crossing", "crssng"],
+    "xrd": ["crossroad"],
+    "xrds": ["crossroads"],
+    "curv": ["curve"],
+    "dl": ["dale"],
+    "dm": ["dam"],
+    "dv": ["divide", "div", "dvd"],
+    "dr": ["drive", "driv", "drv"],
+    "drs": ["drives"],
+    "est": ["estate"],
+    "ests": ["estates"],
+    "expy": ["expressway", "exp", "expr", "express", "expw"],
+    "ext": ["extension", "extn", "extnsn"],
+    "exts": ["extensions"],
+    "fls": ["falls"],
+    "fry": ["ferry", "frry"],
+    "fld": ["field"],
+    "flds": ["fields"],
+    "flt": ["flat"],
+    "flts": ["flats"],
+    "frd": ["ford"],
+    "frds": ["fords"],
+    "frst": ["forest", "forests"],
+    "frg": ["forge", "forg"],
+    "frgs": ["forges"],
+    "frk": ["fork"],
+    "frks": ["forks"],
+    "ft": ["fort", "frt"],
+    "fwy": ["freeway", "freewy", "frway", "frwy"],
+    "gdn": ["garden", "gardn", "grden", "grdn"],
+    "gdns": ["gardens", "grdns"],
+    "gtwy": ["gateway", "gatewy", "gatway", "gtway"],
+    "gln": ["glen"],
+    "glns": ["glens"],
+    "grn": ["green"],
+    "grns": ["greens"],
+    "grv": ["grove", "grov"],
+    "grvs": ["groves"],
+    "hbr": ["harbor", "harb", "harbr", "hrbor"],
+    "hbrs": ["harbors"],
+    "hvn": ["haven"],
+    "hts": ["heights", "ht"],
+    "hwy": ["highway", "highwy", "hiway", "hiwy", "hway"],
+    "hl": ["hill"],
+    "hls": ["hills"],
+    "holw": ["hollow", "hllw", "hollows", "holws"],
+    "inlt": ["inlet"],
+    "is": ["island", "islnd"],
+    "iss": ["islands", "islnds"],
+    "isle": ["isles"],
+    "jct": ["junction", "jction", "jctn", "junctn", "juncton"],
+    "jcts": ["junctions", "jctns"],
+    "ky": ["key"],
+    "kys": ["keys"],
+    "knl": ["knoll", "knol"],
+    "knls": ["knolls"],
+    "lk": ["lake"],
+    "lks": ["lakes"],
+    "land": ["land"],
+    "lndg": ["landing", "lndng"],
+    "ln": ["lane"],
+    "lgt": ["light"],
+    "lgts": ["lights"],
+    "lf": ["loaf"],
+    "lck": ["lock"],
+    "lcks": ["locks"],
+    "ldg": ["lodge", "ldge", "lodg"],
+    "loop": ["loops"],
+    "mall": ["mall"],
+    "mnr": ["manor"],
+    "mnrs": ["manors"],
+    "mdw": ["meadow"],
+    "mdws": ["meadows", "mdw", "medows"],
+    "mews": ["mews"],
+    "ml": ["mill"],
+    "mls": ["mills"],
+    "msn": ["mission", "missn", "mssn"],
+    "mtwy": ["motorway"],
+    "mt": ["mount", "mnt"],
+    "mtn": ["mountain", "mntain", "mntn", "mountin", "mtin"],
+    "mtns": ["mountains", "mntns"],
+    "nck": ["neck"],
+    "orch": ["orchard", "orchrd"],
+    "oval": ["ovl"],
+    "opas": ["overpass"],
+    "park": ["parks"],
+    "pkwy": ["parkway", "parkwy", "pkway", "pky", "parkways", "pkwys"],
+    "pass": ["pass"],
+    "psge": ["passage"],
+    "path": ["paths"],
+    "pike": ["pikes"],
+    "pne": ["pine"],
+    "pnes": ["pines"],
+    "pl": ["place"],
+    "pln": ["plain"],
+    "plns": ["plains"],
+    "plz": ["plaza", "plza"],
+    "pt": ["point"],
+    "pts": ["points"],
+    "prt": ["port"],
+    "prts": ["ports"],
+    "pr": ["prairie", "prr"],
+    "radl": ["radial", "rad", "radiel"],
+    "ramp": ["ramp"],
+    "rnch": ["ranch", "ranches", "rnchs"],
+    "rpd": ["rapid"],
+    "rpds": ["rapids"],
+    "rst": ["rest"],
+    "rdg": ["ridge", "rdge"],
+    "rdgs": ["ridges"],
+    "riv": ["river", "rvr", "rivr"],
+    "rd": ["road"],
+    "rds": ["roads"],
+    "rte": ["route"],
+    "row": ["row"],
+    "rue": ["rue"],
+    "run": ["run"],
+    "shl": ["shoal"],
+    "shls": ["shoals"],
+    "shr": ["shore", "shoar"],
+    "shrs": ["shores", "shoars"],
+    "skwy": ["skyway"],
+    "spg": ["spring", "spng", "sprng"],
+    "spgs": ["springs", "spngs", "sprngs"],
+    "spur": ["spurs"],
+    "sq": ["square", "sqr", "sqre", "squ"],
+    "sqs": ["squares", "sqrs"],
+    "sta": ["station", "statn", "stn"],
+    "stra": ["stravenue", "strav", "straven", "stravn", "strvn", "strvnue"],
+    "strm": ["stream", "streme"],
+    "st": ["street", "strt", "str"],
+    "sts": ["streets"],
+    "smt": ["summit", "sumit", "sumitt"],
+    "ter": ["terrace", "terr"],
+    "trwy": ["throughway"],
+    "trce": ["trace", "traces"],
+    "trak": ["track", "tracks", "trk", "trks"],
+    "trfy": ["trafficway"],
+    "trl": ["trail", "trails", "trls"],
+    "trlr": ["trailer", "trlrs"],
+    "tunl": ["tunnel", "tunel", "tunls", "tunnels", "tunnl"],
+    "tpke": ["turnpike", "trnpk", "turnpk"],
+    "upas": ["underpass"],
+    "un": ["union"],
+    "uns": ["unions"],
+    "vly": ["valley", "vally", "vlly"],
+    "vlys": ["valleys"],
+    "via": ["viaduct", "vdct", "viadct"],
+    "vw": ["view"],
+    "vws": ["views"],
+    "vlg": ["village", "vill", "villag", "villg", "villiage"],
+    "vlgs": ["villages"],
+    "vl": ["ville"],
+    "vis": ["vista", "vist", "vst", "vsta"],
+    "walk": ["walks"],
+    "wall": ["wall"],
+    "way": ["wy"],
+    "ways": ["ways"],
+    "wl": ["well"],
+    "wls": ["wells"]
+}

From f4cceb7d983a9e17a2b16581aa150f5698672c79 Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Mon, 16 Dec 2024 22:13:16 -0800
Subject: [PATCH 153/161] clean up new structure of sec assets

---
 .../transform_sec_input.py                    | 32 +++++++------------
 1 file changed, 12 insertions(+), 20 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py
index 0a51f23..cf21059 100644
--- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py
+++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py
@@ -261,7 +261,6 @@ def transformed_ex21_subsidiary_table(
     ex21_df = _add_report_year_to_sec(ex21_df, sec10k_filing_metadata)
     ex21_df = ex21_df.rename(columns=EX21_COL_MAP)
     ex21_df = clean_location_of_inc(ex21_df)
-    # TODO: what to do with the clean company name?
     ex21_df = transform_company_name(ex21_df)
     ex21_df = _add_parent_company_cik(ex21_df, sec10k_filing_metadata)
     # add an sec_company_id, ultimately this ID become the subsidiary's CIK
@@ -314,54 +313,48 @@ def transform_basic10k_table(
 @asset(
     ins={
         "basic_10k_dfs": AssetIn("basic_10k_company_info"),
-        # "clean_ex21_df": AssetIn("transformed_ex21_subsidiary_table"),
         "sec10k_filing_metadata_dfs": AssetIn("sec10k_filing_metadata"),
-        # specify an io_manager_key?
     },
 )
 def core_sec_10k__filers(
     basic_10k_dfs: dict[str, pd.DataFrame],
-    # clean_ex21_df: pd.DataFrame,
     sec10k_filing_metadata_dfs: dict[str, pd.DataFrame],
 ) -> pd.DataFrame:
-    """Asset for creating an SEC 10K output table.
+    """Asset for creating a cleaned basic 10k table with EIA utility matched.
 
     Flatten the table across time to only keep the most recent record
-    for each CIK. Add in Ex. 21 subsidiaries and link them to already present
-    filing companies. Create an sec_company_id for subsidiaries that aren't linked
-    to a CIK.
+    for each unique company name and address pair. Clean table and link filers
+    to EIA utilities.
     """
     basic_10k_df = pd.concat(basic_10k_dfs.values())
     sec10k_filing_metadata = pd.concat(sec10k_filing_metadata_dfs.values())
     basic_10k_df = transform_basic10k_table(basic_10k_df, sec10k_filing_metadata)
-    # exclude Ex. 21 subs and just match to filers
-    # once the match has been conducted, add back in the Ex. 21 subs
     out_df = basic_10k_df.fillna(np.nan).reset_index(names="record_id")
+    # match EIA utilities to filers
     # TODO: Here we conduct the match to EIA and add on a column with utility_id_eia
     return out_df
 
 
 @asset(
     ins={
-        "sec10k_filers_matched_df": AssetIn("core_sec_10k__filers"),
+        "sec_10k_filers_matched_df": AssetIn("core_sec_10k__filers"),
         "clean_ex21_df": AssetIn("transformed_ex21_subsidiary_table"),
     },
 )
 def out_sec_10k__parents_and_subsidiaries(
-    sec10k_filers_matched_df: pd.DataFrame,
+    sec_10k_filers_matched_df: pd.DataFrame,
     clean_ex21_df: pd.DataFrame,
 ) -> pd.DataFrame:
     """Asset for creating an SEC 10K output table.
 
-    Flatten the table across time to only keep the most recent record
-    for each CIK. Add in Ex. 21 subsidiaries and link them to already present
-    filing companies. Create an sec_company_id for subsidiaries that aren't linked
-    to a CIK.
+    Add in Ex. 21 subsidiaries and link them to already present
+    filing companies. Create an sec_company_id for subsidiaries
+    that aren't linked to a CIK.
     """
     ex21_df_with_cik = match_ex21_subsidiaries_to_filer_company(
-        basic10k_df=sec10k_filers_matched_df, ex21_df=clean_ex21_df
+        basic10k_df=sec_10k_filers_matched_df, ex21_df=clean_ex21_df
     )
-    sec10k_filers_matched_df = sec10k_filers_matched_df.merge(
+    sec_10k_filers_matched_df = sec_10k_filers_matched_df.merge(
         ex21_df_with_cik[["central_index_key", "parent_company_cik", "own_per"]],
         how="left",
         on="central_index_key",
@@ -371,8 +364,7 @@ def out_sec_10k__parents_and_subsidiaries(
         ex21_df_with_cik["central_index_key"].isnull()
     ]
     ex21_non_filing_subs_df.loc[:, "files_10k"] = False
-    out_df = pd.concat([sec10k_filers_matched_df, ex21_non_filing_subs_df])
-    # TODO: match the EIA utilities to the Ex. 21 subs?
+    out_df = pd.concat([sec_10k_filers_matched_df, ex21_non_filing_subs_df])
     return out_df
 
 

From fa9e52e527697276ce6fa9c22a775a0484ebd166 Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Wed, 18 Dec 2024 15:28:56 -0800
Subject: [PATCH 154/161] add in final match between ex 21 subs and eia
 utilities

---
 notebooks/18-kl-splink-sec-eia.ipynb          |  17 +--
 .../20-kl-validate-sec-output-table.ipynb     | 114 ++++++++++++++++++
 .../transform_sec_input.py                    |  17 +++
 3 files changed, 137 insertions(+), 11 deletions(-)

diff --git a/notebooks/18-kl-splink-sec-eia.ipynb b/notebooks/18-kl-splink-sec-eia.ipynb
index 8de5812..8299b9f 100644
--- a/notebooks/18-kl-splink-sec-eia.ipynb
+++ b/notebooks/18-kl-splink-sec-eia.ipynb
@@ -13,15 +13,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "id": "1107fe42-197c-4fea-9c48-06d08699af0b",
    "metadata": {},
    "outputs": [],
    "source": [
-    "import json\n",
-    "import os\n",
-    "from pathlib import Path\n",
-    "\n",
     "import pandas as pd\n",
     "from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, confusion_matrix\n",
     "from splink import block_on, DuckDBAPI, Linker, SettingsCreator\n",
@@ -29,7 +25,6 @@
     "import splink.comparison_library as cl\n",
     "import splink.comparison_level_library as cll\n",
     "from splink.exploratory import completeness_chart, profile_columns\n",
-    "from upath import UPath\n",
     "\n",
     "from mozilla_sec_eia.models.sec_eia_record_linkage.sec_eia_splink_config import (\n",
     "    BLOCKING_RULES,\n",
@@ -61,7 +56,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
    "id": "8b1add80-34d7-44a8-a7b4-181a770bb2cb",
    "metadata": {},
    "outputs": [],
@@ -71,7 +66,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
    "id": "9547a0ca-39f7-46c3-9a02-dcb08b75181a",
    "metadata": {},
    "outputs": [
@@ -247,7 +242,7 @@
        "2          2  1001 ebenezer church solar limited liability c...       176 ebenezer church rd           63186           8567.0  1001 ebenezer church solar, llc  2020-01-01     state road    nc    28676                  True                     None                          None                               None           Q           None             None       None              None             None          None         None            None                None               None            None           None              None         final         2020  1001 ebenezer church solar       EBNSR XRX SLR"
       ]
      },
-     "execution_count": 4,
+     "execution_count": 3,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -258,7 +253,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
    "id": "755ab2a3-a32b-4ac1-81a5-0fb3a85dcdb3",
    "metadata": {},
    "outputs": [
@@ -268,7 +263,7 @@
        "20821"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
diff --git a/notebooks/20-kl-validate-sec-output-table.ipynb b/notebooks/20-kl-validate-sec-output-table.ipynb
index 2b28fb9..d6045f9 100644
--- a/notebooks/20-kl-validate-sec-output-table.ipynb
+++ b/notebooks/20-kl-validate-sec-output-table.ipynb
@@ -22,6 +22,120 @@
     "from upath import UPath"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "511b2c77-ebd2-43b0-8e45-1d1c76fb321d",
+   "metadata": {},
+   "source": [
+    "### EIA"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "4907820f-2552-4a3b-866a-30c3181af91b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "eia_df = pd.read_parquet(\"gs://sec10k-outputs/v2/core_eia__parents_and_subsidiaries.parquet\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5f488f86-4b34-4a94-985f-588f991ba86b",
+   "metadata": {},
+   "source": [
+    "### Ex. 21"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "c1795acc-8005-4b6d-be4d-27c722b634f1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ex21_df = pd.read_pickle(\"/Users/katielamb/CatalystCoop/dagster_home/storage/transformed_ex21_subsidiary_table\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "304d929b-ce6c-4508-b511-475f287a6b37",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "merged_df = ex21_df.merge(\n",
+    "        eia_df.drop_duplicates(subset=\"company_name\")[[\"company_name\", \"utility_id_eia\"]], how=\"left\", on=\"company_name\", suffixes=(\"_ex21\", \"_eia\")\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "d315f8d5-7166-4161-bc4e-79c45ed3ad59",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(1055987, 20821)"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(ex21_df), len(eia_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "3aae6d2c-a941-478e-8178-84cf1321e0b3",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "utility_id_eia\n",
+       "True     1050887\n",
+       "False       5100\n",
+       "Name: count, dtype: int64"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "merged_df.utility_id_eia.isnull().value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "6aba0ae8-a8ee-47ef-8eb9-a0ef9f283b51",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1675"
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(merged_df.utility_id_eia.unique())"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "8d178634-b494-4769-93e3-c0213e4a0326",
diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py
index cf21059..a825e2b 100644
--- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py
+++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py
@@ -339,11 +339,13 @@ def core_sec_10k__filers(
     ins={
         "sec_10k_filers_matched_df": AssetIn("core_sec_10k__filers"),
         "clean_ex21_df": AssetIn("transformed_ex21_subsidiary_table"),
+        "clean_eia_df": AssetIn("core_eia__parents_and_subsidiaries"),
     },
 )
 def out_sec_10k__parents_and_subsidiaries(
     sec_10k_filers_matched_df: pd.DataFrame,
     clean_ex21_df: pd.DataFrame,
+    clean_eia_df: pd.DataFrame,
 ) -> pd.DataFrame:
     """Asset for creating an SEC 10K output table.
 
@@ -364,6 +366,21 @@ def out_sec_10k__parents_and_subsidiaries(
         ex21_df_with_cik["central_index_key"].isnull()
     ]
     ex21_non_filing_subs_df.loc[:, "files_10k"] = False
+    # the last step is to take the EIA utilities that haven't been matched
+    # to a filer company, and merge them by company name onto the Ex. 21 subs
+    unmatched_eia_df = clean_eia_df[
+        ~clean_eia_df["utility_id_eia"].isin(
+            sec_10k_filers_matched_df.utility_id_eia.unique()
+        )
+    ].drop_duplicates(subset="company_name")
+    ex21_non_filing_subs_df = ex21_non_filing_subs_df.merge(
+        unmatched_eia_df[["utility_id_eia", "company_name"]],
+        how="left",
+        on="company_name",
+    )
+    logger.info(
+        f"Ex. 21 subsidiary names matched to an EIA utility name: {len(ex21_non_filing_subs_df["utility_id_eia"].unique())}"
+    )
     out_df = pd.concat([sec_10k_filers_matched_df, ex21_non_filing_subs_df])
     return out_df
 

From 599ae877f26b9357a542aaa28ddedd58e9291d6a Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Wed, 18 Dec 2024 15:29:53 -0800
Subject: [PATCH 155/161] remove sec output table module

---
 .../models/sec_eia_record_linkage/sec_output_table.py  | 10 ----------
 1 file changed, 10 deletions(-)
 delete mode 100644 src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_output_table.py

diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_output_table.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_output_table.py
deleted file mode 100644
index 7f974ad..0000000
--- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_output_table.py
+++ /dev/null
@@ -1,10 +0,0 @@
-"""Module for creating the SEC company output table which connects to EIA company data."""
-
-
-# the input to this method is "core_sec_10k__parents_and_subsidiaries"
-def sec_output_table():
-    """Connect SEC to EIA and format an output table."""
-    # run record linkage to connect SEC to EIA?
-    # add a utility_id_eia column onto the core table
-    # drop the following columns: company_name_no_legal, company_name_mphone, any other intermediate columns
-    pass

From c340718b8feb500f0ea3bac3cf4056346f248639 Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Wed, 18 Dec 2024 15:31:53 -0800
Subject: [PATCH 156/161] add drop duplicates on sec company id

---
 .../models/sec_eia_record_linkage/transform_sec_input.py        | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py
index a825e2b..666f010 100644
--- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py
+++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py
@@ -377,7 +377,7 @@ def out_sec_10k__parents_and_subsidiaries(
         unmatched_eia_df[["utility_id_eia", "company_name"]],
         how="left",
         on="company_name",
-    )
+    ).drop_duplicates(subset="sec_company_id")
     logger.info(
         f"Ex. 21 subsidiary names matched to an EIA utility name: {len(ex21_non_filing_subs_df["utility_id_eia"].unique())}"
     )

From 24de7d61127458c6b4972d24bfc1f1dafdcab74f Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Thu, 19 Dec 2024 11:17:44 -0800
Subject: [PATCH 157/161] clean up notebook

---
 environment.yml                               |    1 -
 notebooks/18-kl-splink-sec-eia.ipynb          | 2216 ++++++++---------
 .../20-kl-validate-sec-output-table.ipynb     |  280 ++-
 src/mozilla_sec_eia/models/sec10k/__init__.py |    1 -
 4 files changed, 1327 insertions(+), 1171 deletions(-)

diff --git a/environment.yml b/environment.yml
index 33b1e04..89eadc8 100644
--- a/environment.yml
+++ b/environment.yml
@@ -29,5 +29,4 @@ dependencies:
 
   # Use pip to install the package defined by this repo for development:
   - pip:
-      # - git+https://github.com/catalyst-cooperative/pudl.git@main
       - --editable ./[dev,docs,tests,types]
diff --git a/notebooks/18-kl-splink-sec-eia.ipynb b/notebooks/18-kl-splink-sec-eia.ipynb
index 8299b9f..a105e3b 100644
--- a/notebooks/18-kl-splink-sec-eia.ipynb
+++ b/notebooks/18-kl-splink-sec-eia.ipynb
@@ -13,7 +13,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "id": "1107fe42-197c-4fea-9c48-06d08699af0b",
    "metadata": {},
    "outputs": [],
@@ -56,7 +56,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "id": "8b1add80-34d7-44a8-a7b4-181a770bb2cb",
    "metadata": {},
    "outputs": [],
@@ -64,193 +64,6 @@
     "eia_df = pd.read_parquet(\"gs://sec10k-outputs/v2/core_eia__parents_and_subsidiaries.parquet\")"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "9547a0ca-39f7-46c3-9a02-dcb08b75181a",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>record_id</th>\n",
-       "      <th>company_name</th>\n",
-       "      <th>street_address</th>\n",
-       "      <th>utility_id_eia</th>\n",
-       "      <th>utility_id_pudl</th>\n",
-       "      <th>company_name_raw</th>\n",
-       "      <th>report_date</th>\n",
-       "      <th>city</th>\n",
-       "      <th>state</th>\n",
-       "      <th>zip_code</th>\n",
-       "      <th>plants_reported_owner</th>\n",
-       "      <th>plants_reported_operator</th>\n",
-       "      <th>plants_reported_asset_manager</th>\n",
-       "      <th>plants_reported_other_relationship</th>\n",
-       "      <th>entity_type</th>\n",
-       "      <th>attention_line</th>\n",
-       "      <th>street_address_2</th>\n",
-       "      <th>zip_code_4</th>\n",
-       "      <th>contact_firstname</th>\n",
-       "      <th>contact_lastname</th>\n",
-       "      <th>contact_title</th>\n",
-       "      <th>phone_number</th>\n",
-       "      <th>phone_extension</th>\n",
-       "      <th>contact_firstname_2</th>\n",
-       "      <th>contact_lastname_2</th>\n",
-       "      <th>contact_title_2</th>\n",
-       "      <th>phone_number_2</th>\n",
-       "      <th>phone_extension_2</th>\n",
-       "      <th>data_maturity</th>\n",
-       "      <th>report_year</th>\n",
-       "      <th>company_name_no_legal</th>\n",
-       "      <th>company_name_mphone</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>0</td>\n",
-       "      <td>0ham wham8 solar limited liability company</td>\n",
-       "      <td>100 california st suite 400</td>\n",
-       "      <td>64380</td>\n",
-       "      <td>8321.0</td>\n",
-       "      <td>0ham wham8 solar, llc</td>\n",
-       "      <td>2023-01-01</td>\n",
-       "      <td>san francisco</td>\n",
-       "      <td>ca</td>\n",
-       "      <td>94118</td>\n",
-       "      <td>True</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Q</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>final</td>\n",
-       "      <td>2023</td>\n",
-       "      <td>0ham wham8 solar</td>\n",
-       "      <td>HM HM SLR</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>1</td>\n",
-       "      <td>10 briggs solar ng limited liability company</td>\n",
-       "      <td>267 water st 2nd floor</td>\n",
-       "      <td>62685</td>\n",
-       "      <td>8502.0</td>\n",
-       "      <td>10 briggs solar ng, llc</td>\n",
-       "      <td>2020-01-01</td>\n",
-       "      <td>warren</td>\n",
-       "      <td>ri</td>\n",
-       "      <td>02885</td>\n",
-       "      <td>True</td>\n",
-       "      <td>True</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Q</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>final</td>\n",
-       "      <td>2020</td>\n",
-       "      <td>10 briggs solar ng</td>\n",
-       "      <td>BRKS SLR NK</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>2</td>\n",
-       "      <td>1001 ebenezer church solar limited liability c...</td>\n",
-       "      <td>176 ebenezer church rd</td>\n",
-       "      <td>63186</td>\n",
-       "      <td>8567.0</td>\n",
-       "      <td>1001 ebenezer church solar, llc</td>\n",
-       "      <td>2020-01-01</td>\n",
-       "      <td>state road</td>\n",
-       "      <td>nc</td>\n",
-       "      <td>28676</td>\n",
-       "      <td>True</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Q</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>final</td>\n",
-       "      <td>2020</td>\n",
-       "      <td>1001 ebenezer church solar</td>\n",
-       "      <td>EBNSR XRX SLR</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   record_id                                       company_name               street_address  utility_id_eia  utility_id_pudl                 company_name_raw report_date           city state zip_code plants_reported_owner plants_reported_operator plants_reported_asset_manager plants_reported_other_relationship entity_type attention_line street_address_2 zip_code_4 contact_firstname contact_lastname contact_title phone_number phone_extension contact_firstname_2 contact_lastname_2 contact_title_2 phone_number_2 phone_extension_2 data_maturity  report_year       company_name_no_legal company_name_mphone\n",
-       "0          0         0ham wham8 solar limited liability company  100 california st suite 400           64380           8321.0            0ham wham8 solar, llc  2023-01-01  san francisco    ca    94118                  True                     None                          None                               None           Q           None             None       None              None             None          None         None            None                None               None            None           None              None         final         2023            0ham wham8 solar           HM HM SLR\n",
-       "1          1       10 briggs solar ng limited liability company       267 water st 2nd floor           62685           8502.0          10 briggs solar ng, llc  2020-01-01         warren    ri    02885                  True                     True                          None                               None           Q           None             None       None              None             None          None         None            None                None               None            None           None              None         final         2020          10 briggs solar ng         BRKS SLR NK\n",
-       "2          2  1001 ebenezer church solar limited liability c...       176 ebenezer church rd           63186           8567.0  1001 ebenezer church solar, llc  2020-01-01     state road    nc    28676                  True                     None                          None                               None           Q           None             None       None              None             None          None         None            None                None               None            None           None              None         final         2020  1001 ebenezer church solar       EBNSR XRX SLR"
-      ]
-     },
-     "execution_count": 3,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "eia_df.head(3)"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 4,
@@ -282,7 +95,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 100,
+   "execution_count": 5,
    "id": "3f5f9e6c-0725-48e1-920f-3d516b4388a6",
    "metadata": {},
    "outputs": [],
@@ -292,182 +105,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 101,
-   "id": "a5ea9e1d-3afd-466f-a506-ecb3f23605c9",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>record_id</th>\n",
-       "      <th>company_name</th>\n",
-       "      <th>street_address</th>\n",
-       "      <th>filename</th>\n",
-       "      <th>phone_number</th>\n",
-       "      <th>central_index_key</th>\n",
-       "      <th>city</th>\n",
-       "      <th>company_name_raw</th>\n",
-       "      <th>date_of_name_change</th>\n",
-       "      <th>film_number</th>\n",
-       "      <th>fiscal_year_end</th>\n",
-       "      <th>form_type</th>\n",
-       "      <th>former_conformed_name</th>\n",
-       "      <th>irs_number</th>\n",
-       "      <th>organization_name</th>\n",
-       "      <th>sec_act</th>\n",
-       "      <th>sec_file_number</th>\n",
-       "      <th>standard_industrial_classification</th>\n",
-       "      <th>state</th>\n",
-       "      <th>state_of_incorporation</th>\n",
-       "      <th>street_address_2</th>\n",
-       "      <th>zip_code</th>\n",
-       "      <th>report_date</th>\n",
-       "      <th>report_year</th>\n",
-       "      <th>location_of_inc</th>\n",
-       "      <th>company_name_no_legal</th>\n",
-       "      <th>company_name_mphone</th>\n",
-       "      <th>files_10k</th>\n",
-       "      <th>sec_company_id</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>0</td>\n",
-       "      <td>024 pharma incorporated</td>\n",
-       "      <td>224 datura st</td>\n",
-       "      <td>edgar/data/1307969/0001683168-17-000653.txt</td>\n",
-       "      <td>(732) 696-9333</td>\n",
-       "      <td>0001307969</td>\n",
-       "      <td>west palm beach</td>\n",
-       "      <td>024 pharma, inc.</td>\n",
-       "      <td>20091202</td>\n",
-       "      <td>17711535</td>\n",
-       "      <td>1231</td>\n",
-       "      <td>10-k</td>\n",
-       "      <td>b green innovations, inc.</td>\n",
-       "      <td>201862731</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>1934 act</td>\n",
-       "      <td>333-120490</td>\n",
-       "      <td>plastics products, nec [3089]</td>\n",
-       "      <td>fl</td>\n",
-       "      <td>nj</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>33401</td>\n",
-       "      <td>2017-03-24</td>\n",
-       "      <td>2017</td>\n",
-       "      <td>new jersey</td>\n",
-       "      <td>024 pharma</td>\n",
-       "      <td>FRM</td>\n",
-       "      <td>True</td>\n",
-       "      <td>0001307969</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>1</td>\n",
-       "      <td>1 800 contacts incorporated</td>\n",
-       "      <td>13751 s wadsworth park dr suite d140</td>\n",
-       "      <td>edgar/data/1050122/0001104659-06-017311.txt</td>\n",
-       "      <td>8015728225</td>\n",
-       "      <td>0001050122</td>\n",
-       "      <td>draper</td>\n",
-       "      <td>1 800 contacts inc</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>06691791</td>\n",
-       "      <td>1231</td>\n",
-       "      <td>10-k</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>870571643</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>1934 act</td>\n",
-       "      <td>000-23633</td>\n",
-       "      <td>retail-catalog &amp; mail-order houses [5961]</td>\n",
-       "      <td>ut</td>\n",
-       "      <td>de</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>84020</td>\n",
-       "      <td>2006-03-16</td>\n",
-       "      <td>2006</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>1 800 contacts</td>\n",
-       "      <td>KNTKTS</td>\n",
-       "      <td>True</td>\n",
-       "      <td>0001050122</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>2</td>\n",
-       "      <td>1 800 contacts incorporated</td>\n",
-       "      <td>66 e wadsworth park dr</td>\n",
-       "      <td>edgar/data/1050122/0001104659-07-019474.txt</td>\n",
-       "      <td>801-316-5000</td>\n",
-       "      <td>0001050122</td>\n",
-       "      <td>draper</td>\n",
-       "      <td>1 800 contacts inc</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>07696033</td>\n",
-       "      <td>1231</td>\n",
-       "      <td>10-k</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>870571643</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>1934 act</td>\n",
-       "      <td>000-23633</td>\n",
-       "      <td>retail-catalog &amp; mail-order houses [5961]</td>\n",
-       "      <td>ut</td>\n",
-       "      <td>de</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>84020</td>\n",
-       "      <td>2007-03-15</td>\n",
-       "      <td>2007</td>\n",
-       "      <td>delaware</td>\n",
-       "      <td>1 800 contacts</td>\n",
-       "      <td>KNTKTS</td>\n",
-       "      <td>True</td>\n",
-       "      <td>0001050122</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "   record_id                 company_name                        street_address                                     filename    phone_number central_index_key             city    company_name_raw date_of_name_change film_number fiscal_year_end form_type      former_conformed_name irs_number organization_name   sec_act sec_file_number         standard_industrial_classification state state_of_incorporation street_address_2 zip_code report_date  report_year location_of_inc company_name_no_legal company_name_mphone  files_10k sec_company_id\n",
-       "0          0      024 pharma incorporated                         224 datura st  edgar/data/1307969/0001683168-17-000653.txt  (732) 696-9333        0001307969  west palm beach    024 pharma, inc.            20091202    17711535            1231      10-k  b green innovations, inc.  201862731               NaN  1934 act      333-120490              plastics products, nec [3089]    fl                     nj              NaN    33401  2017-03-24         2017      new jersey            024 pharma                 FRM       True     0001307969\n",
-       "1          1  1 800 contacts incorporated  13751 s wadsworth park dr suite d140  edgar/data/1050122/0001104659-06-017311.txt      8015728225        0001050122           draper  1 800 contacts inc                 NaN    06691791            1231      10-k                        NaN  870571643               NaN  1934 act       000-23633  retail-catalog & mail-order houses [5961]    ut                     de              NaN    84020  2006-03-16         2006        delaware        1 800 contacts              KNTKTS       True     0001050122\n",
-       "2          2  1 800 contacts incorporated                66 e wadsworth park dr  edgar/data/1050122/0001104659-07-019474.txt    801-316-5000        0001050122           draper  1 800 contacts inc                 NaN    07696033            1231      10-k                        NaN  870571643               NaN  1934 act       000-23633  retail-catalog & mail-order houses [5961]    ut                     de              NaN    84020  2007-03-15         2007        delaware        1 800 contacts              KNTKTS       True     0001050122"
-      ]
-     },
-     "execution_count": 101,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "sec_df.head(3)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 102,
+   "execution_count": 6,
    "id": "63d97f0d-df22-4c27-b3e7-1035166b4011",
    "metadata": {},
    "outputs": [
@@ -477,7 +115,7 @@
        "61026"
       ]
      },
-     "execution_count": 102,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -499,7 +137,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 103,
+   "execution_count": 7,
    "id": "7d2d103a-2bbd-4974-b770-44626bdc5111",
    "metadata": {},
    "outputs": [],
@@ -509,7 +147,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 104,
+   "execution_count": 8,
    "id": "c734137b-fd76-4f6a-a828-23cd12d4ac27",
    "metadata": {},
    "outputs": [],
@@ -519,7 +157,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 105,
+   "execution_count": 9,
    "id": "e754b2ef-5a0d-4582-8694-047528dfd339",
    "metadata": {},
    "outputs": [
@@ -529,7 +167,7 @@
        "True"
       ]
      },
-     "execution_count": 105,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -540,7 +178,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 106,
+   "execution_count": 10,
    "id": "38ad3504-2cde-455f-8896-6a435677541c",
    "metadata": {},
    "outputs": [
@@ -550,7 +188,7 @@
        "True"
       ]
      },
-     "execution_count": 106,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -561,7 +199,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 107,
+   "execution_count": 12,
    "id": "856c14d8-3250-4650-a2db-3808b4718f19",
    "metadata": {},
    "outputs": [
@@ -571,14 +209,13 @@
        "False"
       ]
      },
-     "execution_count": 107,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "# Note that sec_company_id isn't unique here because we are keeping each unique company name and address pair\n",
-    "# later we'll flatten on sec_company_id and utility_id_eia\n",
     "sec_df.sec_company_id.is_unique"
    ]
   },
@@ -587,12 +224,12 @@
    "id": "b18fef7e-c316-4c90-b2bc-04706401135e",
    "metadata": {},
    "source": [
-    "There can be duplicate records because sometimes a company changes utility ID or central index key over time. Keep the most recent version of that record."
+    "There should probably be no duplicate record, but if there are, keep the most recent version of that record."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 108,
+   "execution_count": 19,
    "id": "842fa02e-5202-445c-b728-72bce42e740d",
    "metadata": {},
    "outputs": [
@@ -603,7 +240,7 @@
        "Name: count, dtype: int64"
       ]
      },
-     "execution_count": 108,
+     "execution_count": 19,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -614,7 +251,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 109,
+   "execution_count": 20,
    "id": "b53e6244-f0ca-4256-bc09-9c3264675389",
    "metadata": {},
    "outputs": [
@@ -625,7 +262,7 @@
        "Name: count, dtype: int64"
       ]
      },
-     "execution_count": 109,
+     "execution_count": 20,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -636,7 +273,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 253,
+   "execution_count": 18,
    "id": "e4d54448-0c2f-452b-931c-ff79a5cc3669",
    "metadata": {},
    "outputs": [],
@@ -663,7 +300,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 112,
+   "execution_count": 21,
    "id": "de894ad0-0b72-4486-b737-c3bfb95f8f05",
    "metadata": {},
    "outputs": [],
@@ -673,7 +310,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 113,
+   "execution_count": 22,
    "id": "4bab1568-6a55-427c-9a78-e44db8b0584d",
    "metadata": {},
    "outputs": [
@@ -682,23 +319,23 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-bffae9d64118401bb4629bbba335e3e7.vega-embed {\n",
+       "  #altair-viz-238a2fea13f7415aa7121762b9fa3832.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-bffae9d64118401bb4629bbba335e3e7.vega-embed details,\n",
-       "  #altair-viz-bffae9d64118401bb4629bbba335e3e7.vega-embed details summary {\n",
+       "  #altair-viz-238a2fea13f7415aa7121762b9fa3832.vega-embed details,\n",
+       "  #altair-viz-238a2fea13f7415aa7121762b9fa3832.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-bffae9d64118401bb4629bbba335e3e7\"></div>\n",
+       "<div id=\"altair-viz-238a2fea13f7415aa7121762b9fa3832\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-bffae9d64118401bb4629bbba335e3e7\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-bffae9d64118401bb4629bbba335e3e7\");\n",
+       "    if (outputDiv.id !== \"altair-viz-238a2fea13f7415aa7121762b9fa3832\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-238a2fea13f7415aa7121762b9fa3832\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -751,7 +388,7 @@
        "alt.LayerChart(...)"
       ]
      },
-     "execution_count": 113,
+     "execution_count": 22,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -762,7 +399,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 114,
+   "execution_count": 23,
    "id": "6b9479e3-e836-4407-a2b6-926c185065a8",
    "metadata": {},
    "outputs": [
@@ -771,23 +408,23 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-f131dd48afce49899469d187e41fd69b.vega-embed {\n",
+       "  #altair-viz-299c143177c24d0caed0a11feac611ed.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-f131dd48afce49899469d187e41fd69b.vega-embed details,\n",
-       "  #altair-viz-f131dd48afce49899469d187e41fd69b.vega-embed details summary {\n",
+       "  #altair-viz-299c143177c24d0caed0a11feac611ed.vega-embed details,\n",
+       "  #altair-viz-299c143177c24d0caed0a11feac611ed.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-f131dd48afce49899469d187e41fd69b\"></div>\n",
+       "<div id=\"altair-viz-299c143177c24d0caed0a11feac611ed\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-f131dd48afce49899469d187e41fd69b\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-f131dd48afce49899469d187e41fd69b\");\n",
+       "    if (outputDiv.id !== \"altair-viz-299c143177c24d0caed0a11feac611ed\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-299c143177c24d0caed0a11feac611ed\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -840,7 +477,7 @@
        "alt.LayerChart(...)"
       ]
      },
-     "execution_count": 114,
+     "execution_count": 23,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -851,7 +488,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 115,
+   "execution_count": 24,
    "id": "95bd4db7-c447-4a0e-ab37-59d4beac0b11",
    "metadata": {},
    "outputs": [
@@ -860,23 +497,23 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-f82e60d2f54945b9a271aed10e3561ac.vega-embed {\n",
+       "  #altair-viz-e461e2802f9548f4a5e8af9a3213a168.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-f82e60d2f54945b9a271aed10e3561ac.vega-embed details,\n",
-       "  #altair-viz-f82e60d2f54945b9a271aed10e3561ac.vega-embed details summary {\n",
+       "  #altair-viz-e461e2802f9548f4a5e8af9a3213a168.vega-embed details,\n",
+       "  #altair-viz-e461e2802f9548f4a5e8af9a3213a168.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-f82e60d2f54945b9a271aed10e3561ac\"></div>\n",
+       "<div id=\"altair-viz-e461e2802f9548f4a5e8af9a3213a168\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-f82e60d2f54945b9a271aed10e3561ac\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-f82e60d2f54945b9a271aed10e3561ac\");\n",
+       "    if (outputDiv.id !== \"altair-viz-e461e2802f9548f4a5e8af9a3213a168\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-e461e2802f9548f4a5e8af9a3213a168\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -922,14 +559,14 @@
        "        .catch(showError)\n",
        "        .then(() => displayChart(vegaEmbed));\n",
        "    }\n",
-       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"vconcat\": [{\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9997377991676331, \"percentile_inc_nulls\": 0.9997377991676331, \"value_count\": 8, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 0.9989348649978638, \"percentile_inc_nulls\": 0.9989348649978638, \"value_count\": 7, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 49.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 0.9960836172103882, \"percentile_inc_nulls\": 0.9960836172103882, \"value_count\": 6, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 174.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 0.9845311641693115, \"percentile_inc_nulls\": 0.9845311641693115, \"value_count\": 5, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 705.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 0.9480876922607422, \"percentile_inc_nulls\": 0.9480876922607422, \"value_count\": 4, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 2224.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 0.8398387432098389, \"percentile_inc_nulls\": 0.8398387432098389, \"value_count\": 3, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 6606.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 
0.5750991106033325, \"percentile_inc_nulls\": 0.5750991106033325, \"value_count\": 2, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 16156.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 1, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 35096.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 8, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 46111}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"company_name\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 46111 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 8, \"group_name\": \"_company_name_\", \"value\": \"la jolla pharmaceutical company\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 8, \"group_name\": \"_company_name_\", \"value\": \"comprehensive care corporation\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 
46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"comerica inc /new/\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"dycom industries incorporated\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"omega healthcare investors incorporated\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"empire petroleum corporation\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"stillwater mining co /de/\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"camelot corporation\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"softech incorporated\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 6, \"group_name\": \"_company_name_\", \"value\": \"green plains renewable energy incorporated\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": 
\"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"12 retech corporation\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"11 good energy incorporated\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"1 lane technologies corporation\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"10x capital venture acquisition corporation\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"155 east tropicana limited liability company\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 8]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.8305076360702515, \"percentile_inc_nulls\": 0.8306131958961487, \"value_count\": 10337, \"group_name\": \"_state_\", 
\"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 10337.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.691791832447052, \"percentile_inc_nulls\": 0.6919837594032288, \"value_count\": 8460, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 8460.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.6141864061355591, \"percentile_inc_nulls\": 0.6144266128540039, \"value_count\": 4733, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 4733.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.563307523727417, \"percentile_inc_nulls\": 0.5635794401168823, \"value_count\": 3103, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 3103.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.5253000259399414, \"percentile_inc_nulls\": 0.5255956649780273, \"value_count\": 2318, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 2318.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.49045711755752563, \"percentile_inc_nulls\": 0.4907744526863098, \"value_count\": 2125, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 2125.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.4584508538246155, \"percentile_inc_nulls\": 0.45878803730010986, \"value_count\": 1952, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1952.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.42987143993377686, \"percentile_inc_nulls\": 0.4302264451980591, \"value_count\": 1743, \"group_name\": 
\"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1743.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.4037187695503235, \"percentile_inc_nulls\": 0.4040900468826294, \"value_count\": 1595, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1595.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.3788614273071289, \"percentile_inc_nulls\": 0.3792482018470764, \"value_count\": 1516, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1516.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.35418444871902466, \"percentile_inc_nulls\": 0.35458654165267944, \"value_count\": 1505, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1505.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.33116352558135986, \"percentile_inc_nulls\": 0.3315799832344055, \"value_count\": 1404, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1404.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.3082737326622009, \"percentile_inc_nulls\": 0.3087044954299927, \"value_count\": 1396, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1396.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.2873516082763672, \"percentile_inc_nulls\": 0.28779536485671997, \"value_count\": 1276, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1276.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.26756083965301514, \"percentile_inc_nulls\": 0.2680169343948364, \"value_count\": 1207, 
\"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1207.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.24914735555648804, \"percentile_inc_nulls\": 0.2496148943901062, \"value_count\": 1123, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1123.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.23243916034698486, \"percentile_inc_nulls\": 0.23291712999343872, \"value_count\": 1019, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1019.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.21587854623794556, \"percentile_inc_nulls\": 0.21636676788330078, \"value_count\": 1010, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1010.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.20199054479599, \"percentile_inc_nulls\": 0.20248746871948242, \"value_count\": 847, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 847.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.18875843286514282, \"percentile_inc_nulls\": 0.18926358222961426, \"value_count\": 807, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 807.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.17613303661346436, \"percentile_inc_nulls\": 0.17664599418640137, \"value_count\": 770, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 770.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.1640486717224121, \"percentile_inc_nulls\": 0.1645691990852356, \"value_count\": 
737, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 737.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.15265297889709473, \"percentile_inc_nulls\": 0.15318059921264648, \"value_count\": 695, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 695.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.14209353923797607, \"percentile_inc_nulls\": 0.14262771606445312, \"value_count\": 644, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 644.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.13258343935012817, \"percentile_inc_nulls\": 0.13312357664108276, \"value_count\": 580, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 580.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.12354886531829834, \"percentile_inc_nulls\": 0.12409466505050659, \"value_count\": 551, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 551.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.11457991600036621, \"percentile_inc_nulls\": 0.11513125896453857, \"value_count\": 547, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 547.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.10720139741897583, \"percentile_inc_nulls\": 0.1077573299407959, \"value_count\": 450, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 450.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.10013443231582642, \"percentile_inc_nulls\": 0.10069477558135986, 
\"value_count\": 431, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 431.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0935102105140686, \"percentile_inc_nulls\": 0.09407466650009155, \"value_count\": 404, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 404.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.08788615465164185, \"percentile_inc_nulls\": 0.08845412731170654, \"value_count\": 343, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 343.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.08291792869567871, \"percentile_inc_nulls\": 0.08348900079727173, \"value_count\": 303, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 303.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.07842528820037842, \"percentile_inc_nulls\": 0.0789991021156311, \"value_count\": 274, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 274.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.07416212558746338, \"percentile_inc_nulls\": 0.07473862171173096, \"value_count\": 260, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 260.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.07007938623428345, \"percentile_inc_nulls\": 0.07065838575363159, \"value_count\": 249, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 249.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.06240570545196533, \"percentile_inc_nulls\": 0.06298953294754028, 
\"value_count\": 234, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 468.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0587000846862793, \"percentile_inc_nulls\": 0.05928617715835571, \"value_count\": 226, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 226.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.05512559413909912, \"percentile_inc_nulls\": 0.055713951587677, \"value_count\": 218, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 218.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.05199384689331055, \"percentile_inc_nulls\": 0.052584171295166016, \"value_count\": 191, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 191.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.048944056034088135, \"percentile_inc_nulls\": 0.049536287784576416, \"value_count\": 186, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 186.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.04597628116607666, \"percentile_inc_nulls\": 0.0465703010559082, \"value_count\": 181, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 181.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.04348397254943848, \"percentile_inc_nulls\": 0.044079601764678955, \"value_count\": 152, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 152.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.041270434856414795, \"percentile_inc_nulls\": 
0.04186737537384033, \"value_count\": 135, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 135.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.03918802738189697, \"percentile_inc_nulls\": 0.039786338806152344, \"value_count\": 127, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 127.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.037335216999053955, \"percentile_inc_nulls\": 0.03793466091156006, \"value_count\": 113, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 113.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.03549879789352417, \"percentile_inc_nulls\": 0.036099374294281006, \"value_count\": 112, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.03369516134262085, \"percentile_inc_nulls\": 0.03429687023162842, \"value_count\": 110, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.03195708990097046, \"percentile_inc_nulls\": 0.032559871673583984, \"value_count\": 106, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 106.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.03026825189590454, \"percentile_inc_nulls\": 0.030872106552124023, \"value_count\": 103, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 103.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.028595805168151855, 
\"percentile_inc_nulls\": 0.02920067310333252, \"value_count\": 102, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.026972532272338867, \"percentile_inc_nulls\": 0.027578413486480713, \"value_count\": 99, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.02552962303161621, \"percentile_inc_nulls\": 0.026136398315429688, \"value_count\": 88, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.022938966751098633, \"percentile_inc_nulls\": 0.023547351360321045, \"value_count\": 79, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 158.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.021774768829345703, \"percentile_inc_nulls\": 0.022383928298950195, \"value_count\": 71, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 71.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.020659804344177246, \"percentile_inc_nulls\": 0.021269619464874268, \"value_count\": 68, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 68.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.01956123113632202, \"percentile_inc_nulls\": 0.020171701908111572, \"value_count\": 67, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 67.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 
0.018577396869659424, \"percentile_inc_nulls\": 0.019188523292541504, \"value_count\": 60, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 60.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.017626404762268066, \"percentile_inc_nulls\": 0.0182381272315979, \"value_count\": 58, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.015757203102111816, \"percentile_inc_nulls\": 0.016370058059692383, \"value_count\": 57, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.014838993549346924, \"percentile_inc_nulls\": 0.015452444553375244, \"value_count\": 56, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.014019131660461426, \"percentile_inc_nulls\": 0.014633119106292725, \"value_count\": 50, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 50.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0132485032081604, \"percentile_inc_nulls\": 0.013862967491149902, \"value_count\": 47, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 47.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.012494266033172607, \"percentile_inc_nulls\": 0.013109147548675537, \"value_count\": 46, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 46.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 
0.011789202690124512, \"percentile_inc_nulls\": 0.012404561042785645, \"value_count\": 43, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 43.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.01121532917022705, \"percentile_inc_nulls\": 0.011831045150756836, \"value_count\": 35, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 35.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.010674238204956055, \"percentile_inc_nulls\": 0.011290252208709717, \"value_count\": 33, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.010149538516998291, \"percentile_inc_nulls\": 0.010765910148620605, \"value_count\": 32, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 32.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.009674012660980225, \"percentile_inc_nulls\": 0.010290682315826416, \"value_count\": 29, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 29.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.009264111518859863, \"percentile_inc_nulls\": 0.009881019592285156, \"value_count\": 25, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 25.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.008870601654052734, \"percentile_inc_nulls\": 0.009487748146057129, \"value_count\": 24, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 24.0, \"distinct_value_count\": 172}, 
{\"percentile_ex_nulls\": 0.008181929588317871, \"percentile_inc_nulls\": 0.008799552917480469, \"value_count\": 21, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.007870376110076904, \"percentile_inc_nulls\": 0.008488178253173828, \"value_count\": 19, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 19.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.007575273513793945, \"percentile_inc_nulls\": 0.008193254470825195, \"value_count\": 18, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 18.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.007050573825836182, \"percentile_inc_nulls\": 0.007668852806091309, \"value_count\": 16, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 32.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.00680464506149292, \"percentile_inc_nulls\": 0.007423043251037598, \"value_count\": 15, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 15.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.006115972995758057, \"percentile_inc_nulls\": 0.0067348480224609375, \"value_count\": 14, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.00590282678604126, \"percentile_inc_nulls\": 0.006521821022033691, \"value_count\": 13, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 13.0, \"distinct_value_count\": 
172}, {\"percentile_ex_nulls\": 0.00570601224899292, \"percentile_inc_nulls\": 0.006325185298919678, \"value_count\": 12, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 12.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.005164921283721924, \"percentile_inc_nulls\": 0.005784392356872559, \"value_count\": 11, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0041811466217041016, \"percentile_inc_nulls\": 0.00480121374130249, \"value_count\": 10, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 60.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.003885984420776367, \"percentile_inc_nulls\": 0.004506289958953857, \"value_count\": 9, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 18.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.003623664379119873, \"percentile_inc_nulls\": 0.004244089126586914, \"value_count\": 8, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0035088658332824707, \"percentile_inc_nulls\": 0.0041294097900390625, \"value_count\": 7, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 7.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.002721846103668213, \"percentile_inc_nulls\": 0.003342866897583008, \"value_count\": 6, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 48.0, \"distinct_value_count\": 
172}, {\"percentile_ex_nulls\": 0.0019840002059936523, \"percentile_inc_nulls\": 0.002605438232421875, \"value_count\": 5, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 45.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0012625455856323242, \"percentile_inc_nulls\": 0.00188446044921875, \"value_count\": 4, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 44.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0007214546203613281, \"percentile_inc_nulls\": 0.0013436675071716309, \"value_count\": 3, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0003935098648071289, \"percentile_inc_nulls\": 0.0010159611701965332, \"value_count\": 2, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 20.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0006226897239685059, \"value_count\": 1, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 24.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 10337, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 10337.0, \"distinct_value_count\": 172}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": 
\"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"state\\\"\", \"subtitle\": \"In this col, 38 values (0.1%) are null and there are 172 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 10337, \"group_name\": \"_state_\", \"value\": \"ca\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 8460, \"group_name\": \"_state_\", \"value\": \"ny\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 4733, \"group_name\": \"_state_\", \"value\": \"tx\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 3103, \"group_name\": \"_state_\", \"value\": \"fl\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 2318, \"group_name\": \"_state_\", \"value\": \"ma\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 2125, \"group_name\": \"_state_\", \"value\": \"il\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1952, \"group_name\": \"_state_\", \"value\": \"nj\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1743, \"group_name\": \"_state_\", \"value\": \"pa\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1595, \"group_name\": \"_state_\", \"value\": \"md\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, 
\"distinct_value_count\": 172}, {\"value_count\": 1516, \"group_name\": \"_state_\", \"value\": \"nv\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"a7\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"s9\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"w5\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"j1\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"2a\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 
10337]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.8892185091972351, \"percentile_inc_nulls\": 0.8892439603805542, \"value_count\": 6759, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 6759.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.8651740550994873, \"percentile_inc_nulls\": 0.8652049899101257, \"value_count\": 1467, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1467.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.8483904600143433, \"percentile_inc_nulls\": 0.8484252691268921, \"value_count\": 1024, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1024.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.8343932628631592, \"percentile_inc_nulls\": 0.8344312310218811, \"value_count\": 854, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 854.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.8206254243850708, \"percentile_inc_nulls\": 0.8206666111946106, \"value_count\": 840, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 840.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.8072838187217712, \"percentile_inc_nulls\": 0.8073280453681946, \"value_count\": 814, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 814.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7944011092185974, 
\"percentile_inc_nulls\": 0.7944482564926147, \"value_count\": 786, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 786.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7821412086486816, \"percentile_inc_nulls\": 0.7821912169456482, \"value_count\": 748, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 748.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7701107859611511, \"percentile_inc_nulls\": 0.7701635360717773, \"value_count\": 734, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 734.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7582606673240662, \"percentile_inc_nulls\": 0.758316159248352, \"value_count\": 723, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 723.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7466564178466797, \"percentile_inc_nulls\": 0.7467144727706909, \"value_count\": 708, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 708.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7364616394042969, \"percentile_inc_nulls\": 0.7365221381187439, \"value_count\": 622, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 622.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7267422676086426, \"percentile_inc_nulls\": 0.7268049716949463, \"value_count\": 593, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 593.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7170720100402832, 
\"percentile_inc_nulls\": 0.7171369791030884, \"value_count\": 590, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 590.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7074673771858215, \"percentile_inc_nulls\": 0.7075344920158386, \"value_count\": 586, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 586.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6978955268859863, \"percentile_inc_nulls\": 0.6979647874832153, \"value_count\": 584, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 584.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.688372790813446, \"percentile_inc_nulls\": 0.6884442567825317, \"value_count\": 581, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 581.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6793090105056763, \"percentile_inc_nulls\": 0.6793825626373291, \"value_count\": 553, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 553.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6723759174346924, \"percentile_inc_nulls\": 0.6724510788917542, \"value_count\": 423, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 423.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6655576229095459, \"percentile_inc_nulls\": 0.6656343340873718, \"value_count\": 416, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 416.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6590179204940796, 
\"percentile_inc_nulls\": 0.6590961217880249, \"value_count\": 399, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 399.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6526912450790405, \"percentile_inc_nulls\": 0.6527709364891052, \"value_count\": 386, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 386.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6471513509750366, \"percentile_inc_nulls\": 0.6472322940826416, \"value_count\": 338, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 338.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6416934728622437, \"percentile_inc_nulls\": 0.6417756080627441, \"value_count\": 333, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 333.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6312692165374756, \"percentile_inc_nulls\": 0.6313538551330566, \"value_count\": 318, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 636.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.626384973526001, \"percentile_inc_nulls\": 0.626470685005188, \"value_count\": 298, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 298.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6217301487922668, \"percentile_inc_nulls\": 0.6218169331550598, \"value_count\": 284, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 284.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.612518846988678, 
\"percentile_inc_nulls\": 0.6126077175140381, \"value_count\": 281, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 562.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6079951524734497, \"percentile_inc_nulls\": 0.608085036277771, \"value_count\": 276, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 276.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.603569746017456, \"percentile_inc_nulls\": 0.6036607027053833, \"value_count\": 270, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 270.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5992263555526733, \"percentile_inc_nulls\": 0.5993183255195618, \"value_count\": 265, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 265.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5948993563652039, \"percentile_inc_nulls\": 0.5949922800064087, \"value_count\": 264, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 264.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5908181667327881, \"percentile_inc_nulls\": 0.5909121036529541, \"value_count\": 249, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 249.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5869500637054443, \"percentile_inc_nulls\": 0.5870448350906372, \"value_count\": 236, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 236.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5831148028373718, 
\"percentile_inc_nulls\": 0.5832104682922363, \"value_count\": 234, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.575706422328949, \"percentile_inc_nulls\": 0.5758037567138672, \"value_count\": 226, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 452.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5720186233520508, \"percentile_inc_nulls\": 0.5721167922019958, \"value_count\": 225, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 225.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5685439109802246, \"percentile_inc_nulls\": 0.5686428546905518, \"value_count\": 212, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 212.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5616599917411804, \"percentile_inc_nulls\": 0.5617605447769165, \"value_count\": 210, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 420.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.558234453201294, \"percentile_inc_nulls\": 0.5583357810974121, \"value_count\": 209, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 209.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5548253059387207, \"percentile_inc_nulls\": 0.554927408695221, \"value_count\": 208, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 208.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5514816641807556, 
\"percentile_inc_nulls\": 0.5515846014022827, \"value_count\": 204, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 204.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5482856035232544, \"percentile_inc_nulls\": 0.548389196395874, \"value_count\": 195, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 195.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5451222658157349, \"percentile_inc_nulls\": 0.5452266335487366, \"value_count\": 193, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 193.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5419917106628418, \"percentile_inc_nulls\": 0.5420968532562256, \"value_count\": 191, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 191.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5389431715011597, \"percentile_inc_nulls\": 0.5390489101409912, \"value_count\": 186, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 186.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5359601378440857, \"percentile_inc_nulls\": 0.5360665917396545, \"value_count\": 182, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 182.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.532993495464325, \"percentile_inc_nulls\": 0.5331006050109863, \"value_count\": 181, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 181.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5302727222442627, 
\"percentile_inc_nulls\": 0.5303804874420166, \"value_count\": 166, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 166.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5275847315788269, \"percentile_inc_nulls\": 0.5276931524276733, \"value_count\": 164, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 164.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5249786972999573, \"percentile_inc_nulls\": 0.5250876545906067, \"value_count\": 159, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 159.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5224218368530273, \"percentile_inc_nulls\": 0.5225313901901245, \"value_count\": 156, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 156.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5198976993560791, \"percentile_inc_nulls\": 0.520007848739624, \"value_count\": 154, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 154.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5173900127410889, \"percentile_inc_nulls\": 0.5175007581710815, \"value_count\": 153, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 153.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5148987174034119, \"percentile_inc_nulls\": 0.5150099992752075, \"value_count\": 152, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 152.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5124237537384033, 
\"percentile_inc_nulls\": 0.5125356316566467, \"value_count\": 151, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 151.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5099652409553528, \"percentile_inc_nulls\": 0.510077714920044, \"value_count\": 150, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 150.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5075395107269287, \"percentile_inc_nulls\": 0.5076524615287781, \"value_count\": 148, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 148.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5051465034484863, \"percentile_inc_nulls\": 0.5052600502967834, \"value_count\": 146, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 146.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4980167746543884, \"percentile_inc_nulls\": 0.4981319308280945, \"value_count\": 145, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 435.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.48621582984924316, \"percentile_inc_nulls\": 0.4863336682319641, \"value_count\": 144, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 720.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4838883876800537, \"percentile_inc_nulls\": 0.4840068221092224, \"value_count\": 142, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 142.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.48161017894744873, 
\"percentile_inc_nulls\": 0.48172909021377563, \"value_count\": 139, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 139.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.47936469316482544, \"percentile_inc_nulls\": 0.4794841408729553, \"value_count\": 137, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 137.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.47713565826416016, \"percentile_inc_nulls\": 0.47725558280944824, \"value_count\": 136, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 136.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4749557375907898, \"percentile_inc_nulls\": 0.47507619857788086, \"value_count\": 133, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 133.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.47279220819473267, \"percentile_inc_nulls\": 0.4729132056236267, \"value_count\": 132, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 132.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4706615209579468, \"percentile_inc_nulls\": 0.47078293561935425, \"value_count\": 130, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 130.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4685635566711426, \"percentile_inc_nulls\": 0.468685507774353, \"value_count\": 128, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 128.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4664819836616516, 
\"percentile_inc_nulls\": 0.46660441160202026, \"value_count\": 127, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 127.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4644496440887451, \"percentile_inc_nulls\": 0.4645724892616272, \"value_count\": 124, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 124.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4624336361885071, \"percentile_inc_nulls\": 0.46255695819854736, \"value_count\": 123, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 123.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4604504108428955, \"percentile_inc_nulls\": 0.460574209690094, \"value_count\": 121, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 121.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.45848357677459717, \"percentile_inc_nulls\": 0.4586077928543091, \"value_count\": 120, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.45458269119262695, \"percentile_inc_nulls\": 0.4547078013420105, \"value_count\": 119, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 238.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4526650309562683, \"percentile_inc_nulls\": 0.45279061794281006, \"value_count\": 117, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 117.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.45078015327453613, 
\"percentile_inc_nulls\": 0.4509061574935913, \"value_count\": 115, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 115.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.445223867893219, \"percentile_inc_nulls\": 0.44535118341445923, \"value_count\": 113, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 339.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4416508078575134, \"percentile_inc_nulls\": 0.44177889823913574, \"value_count\": 109, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 218.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4398806691169739, \"percentile_inc_nulls\": 0.4400091767311096, \"value_count\": 108, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.43812692165374756, \"percentile_inc_nulls\": 0.43825584650039673, \"value_count\": 107, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 107.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.43468499183654785, \"percentile_inc_nulls\": 0.4348146915435791, \"value_count\": 105, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 210.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4314069151878357, \"percentile_inc_nulls\": 0.431537389755249, \"value_count\": 100, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.42978429794311523, 
\"percentile_inc_nulls\": 0.4299151301383972, \"value_count\": 99, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.428178071975708, \"percentile_inc_nulls\": 0.42830926179885864, \"value_count\": 98, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.42499834299087524, \"percentile_inc_nulls\": 0.4251302480697632, \"value_count\": 97, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 194.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4234248995780945, \"percentile_inc_nulls\": 0.4235571622848511, \"value_count\": 96, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 96.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4218842387199402, \"percentile_inc_nulls\": 0.42201685905456543, \"value_count\": 94, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 94.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4203763008117676, \"percentile_inc_nulls\": 0.4205092787742615, \"value_count\": 92, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 92.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.418884813785553, \"percentile_inc_nulls\": 0.4190181493759155, \"value_count\": 91, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 91.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4145086407661438, 
\"percentile_inc_nulls\": 0.41464293003082275, \"value_count\": 89, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 267.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4131154417991638, \"percentile_inc_nulls\": 0.4132500886917114, \"value_count\": 85, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 85.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4103618860244751, \"percentile_inc_nulls\": 0.41049718856811523, \"value_count\": 84, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 168.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.40764111280441284, \"percentile_inc_nulls\": 0.40777701139450073, \"value_count\": 83, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 166.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.40495312213897705, \"percentile_inc_nulls\": 0.4050896167755127, \"value_count\": 82, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 164.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4022979140281677, \"percentile_inc_nulls\": 0.4024350047111511, \"value_count\": 81, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 162.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4009866714477539, \"percentile_inc_nulls\": 0.40112411975860596, \"value_count\": 80, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 80.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3983970284461975, 
\"percentile_inc_nulls\": 0.3985350728034973, \"value_count\": 79, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 158.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3971513509750366, \"percentile_inc_nulls\": 0.3972896933555603, \"value_count\": 76, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 76.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3934635519981384, \"percentile_inc_nulls\": 0.39360272884368896, \"value_count\": 75, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 225.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.39103782176971436, \"percentile_inc_nulls\": 0.39117753505706787, \"value_count\": 74, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 148.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3863174319267273, \"percentile_inc_nulls\": 0.38645821809768677, \"value_count\": 72, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 288.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.38515371084213257, \"percentile_inc_nulls\": 0.3852947950363159, \"value_count\": 71, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 71.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.38400644063949585, \"percentile_inc_nulls\": 0.3841477632522583, \"value_count\": 70, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.38061362504959106, 
\"percentile_inc_nulls\": 0.3807557225227356, \"value_count\": 69, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 207.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3783845901489258, \"percentile_inc_nulls\": 0.3785271644592285, \"value_count\": 68, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 136.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3773028254508972, \"percentile_inc_nulls\": 0.37744569778442383, \"value_count\": 66, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 66.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3741067051887512, \"percentile_inc_nulls\": 0.37425029277801514, \"value_count\": 65, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 195.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3730577826499939, \"percentile_inc_nulls\": 0.3732016086578369, \"value_count\": 64, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 64.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.37202519178390503, \"percentile_inc_nulls\": 0.37216925621032715, \"value_count\": 63, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 63.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3710089921951294, \"percentile_inc_nulls\": 0.3711532950401306, \"value_count\": 62, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 62.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3690093755722046, 
\"percentile_inc_nulls\": 0.369154155254364, \"value_count\": 61, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 122.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3660591244697571, \"percentile_inc_nulls\": 0.36620455980300903, \"value_count\": 60, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 180.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.36130595207214355, \"percentile_inc_nulls\": 0.36145251989364624, \"value_count\": 58, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 290.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3594374656677246, \"percentile_inc_nulls\": 0.3595844507217407, \"value_count\": 57, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.35851961374282837, \"percentile_inc_nulls\": 0.3586667776107788, \"value_count\": 56, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.35671669244766235, \"percentile_inc_nulls\": 0.3568642735481262, \"value_count\": 55, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3549465537071228, \"percentile_inc_nulls\": 0.3550945520401001, \"value_count\": 54, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.35147184133529663, 
\"percentile_inc_nulls\": 0.351620614528656, \"value_count\": 53, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 212.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3489149808883667, \"percentile_inc_nulls\": 0.34906435012817383, \"value_count\": 52, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 156.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.34724318981170654, \"percentile_inc_nulls\": 0.3473929166793823, \"value_count\": 51, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3439651131629944, \"percentile_inc_nulls\": 0.34411561489105225, \"value_count\": 50, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.33994948863983154, \"percentile_inc_nulls\": 0.3401009440422058, \"value_count\": 49, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 245.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3344423770904541, \"percentile_inc_nulls\": 0.33459508419036865, \"value_count\": 48, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 336.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.33213138580322266, \"percentile_inc_nulls\": 0.3322846293449402, \"value_count\": 47, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 141.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3291155695915222, 
\"percentile_inc_nulls\": 0.3292694687843323, \"value_count\": 46, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 184.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3261653184890747, \"percentile_inc_nulls\": 0.32631993293762207, \"value_count\": 45, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 180.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3211171627044678, \"percentile_inc_nulls\": 0.32127290964126587, \"value_count\": 44, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 308.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3168884515762329, \"percentile_inc_nulls\": 0.3170452117919922, \"value_count\": 43, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 258.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.31482332944869995, \"percentile_inc_nulls\": 0.31498050689697266, \"value_count\": 42, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 126.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3101193308830261, \"percentile_inc_nulls\": 0.3102775812149048, \"value_count\": 41, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 287.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.30749690532684326, \"percentile_inc_nulls\": 0.3076557517051697, \"value_count\": 40, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3030223846435547, 
\"percentile_inc_nulls\": 0.3031822443008423, \"value_count\": 39, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 273.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.30115389823913574, \"percentile_inc_nulls\": 0.30131417512893677, \"value_count\": 38, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2987281084060669, \"percentile_inc_nulls\": 0.2988889813423157, \"value_count\": 37, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 148.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2957778573036194, \"percentile_inc_nulls\": 0.29593944549560547, \"value_count\": 36, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 180.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.29348325729370117, \"percentile_inc_nulls\": 0.29364532232284546, \"value_count\": 35, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 140.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2884678244590759, \"percentile_inc_nulls\": 0.2886310815811157, \"value_count\": 34, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 306.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.285222589969635, \"percentile_inc_nulls\": 0.2853865623474121, \"value_count\": 33, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 198.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2831246256828308, 
\"percentile_inc_nulls\": 0.2832890748977661, \"value_count\": 32, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 128.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2770274877548218, \"percentile_inc_nulls\": 0.2771933078765869, \"value_count\": 31, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 372.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2735854983329773, \"percentile_inc_nulls\": 0.2737521529197693, \"value_count\": 30, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 210.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2726348638534546, \"percentile_inc_nulls\": 0.2728017568588257, \"value_count\": 29, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.26942241191864014, \"percentile_inc_nulls\": 0.26959002017974854, \"value_count\": 28, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 196.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.26632463932037354, \"percentile_inc_nulls\": 0.26649296283721924, \"value_count\": 27, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 189.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.26078474521636963, \"percentile_inc_nulls\": 0.2609543204307556, \"value_count\": 26, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 338.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.25586771965026855, 
\"percentile_inc_nulls\": 0.2560384273529053, \"value_count\": 25, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 300.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.24996721744537354, \"percentile_inc_nulls\": 0.2501392960548401, \"value_count\": 24, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 360.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.24544352293014526, \"percentile_inc_nulls\": 0.245616614818573, \"value_count\": 23, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 276.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.23895299434661865, \"percentile_inc_nulls\": 0.23912757635116577, \"value_count\": 22, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 396.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.23551106452941895, \"percentile_inc_nulls\": 0.23568642139434814, \"value_count\": 21, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 210.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2273159623146057, \"percentile_inc_nulls\": 0.22749322652816772, \"value_count\": 20, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 500.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2207762598991394, \"percentile_inc_nulls\": 0.2209550142288208, \"value_count\": 19, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 399.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2142857313156128, 
\"percentile_inc_nulls\": 0.21446597576141357, \"value_count\": 18, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 396.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2109420895576477, \"percentile_inc_nulls\": 0.21112310886383057, \"value_count\": 17, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 204.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.20648396015167236, \"percentile_inc_nulls\": 0.2066659927368164, \"value_count\": 16, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 272.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.1981249451637268, \"percentile_inc_nulls\": 0.19830894470214844, \"value_count\": 15, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 510.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.19101160764694214, \"percentile_inc_nulls\": 0.1911972165107727, \"value_count\": 14, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 434.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.18227559328079224, \"percentile_inc_nulls\": 0.18246322870254517, \"value_count\": 13, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 533.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.17460501194000244, \"percentile_inc_nulls\": 0.17479437589645386, \"value_count\": 12, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 468.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.16559040546417236, 
\"percentile_inc_nulls\": 0.16578179597854614, \"value_count\": 11, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 550.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.15854257345199585, \"percentile_inc_nulls\": 0.15873563289642334, \"value_count\": 10, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 430.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.14836424589157104, \"percentile_inc_nulls\": 0.14855962991714478, \"value_count\": 9, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 621.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.13669443130493164, \"percentile_inc_nulls\": 0.13689249753952026, \"value_count\": 8, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 712.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.12717169523239136, \"percentile_inc_nulls\": 0.12737196683883667, \"value_count\": 7, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 581.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.11497735977172852, \"percentile_inc_nulls\": 0.11518043279647827, \"value_count\": 6, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 744.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.09989839792251587, \"percentile_inc_nulls\": 0.10010486841201782, \"value_count\": 5, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 920.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.0844915509223938, 
\"percentile_inc_nulls\": 0.08470159769058228, \"value_count\": 4, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 940.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.06585592031478882, \"percentile_inc_nulls\": 0.06607019901275635, \"value_count\": 3, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1137.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.040352702140808105, \"percentile_inc_nulls\": 0.0405728816986084, \"value_count\": 2, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1556.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.00022941827774047852, \"value_count\": 1, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 2462.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 6759, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 6759.0, \"distinct_value_count\": 5121}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"city\\\"\", 
\"subtitle\": \"In this col, 14 values (0.0%) are null and there are 5121 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 6759, \"group_name\": \"_city_\", \"value\": \"new york\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 1467, \"group_name\": \"_city_\", \"value\": \"houston\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 1024, \"group_name\": \"_city_\", \"value\": \"dallas\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 854, \"group_name\": \"_city_\", \"value\": \"las vegas\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 840, \"group_name\": \"_city_\", \"value\": \"calabasas\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 814, \"group_name\": \"_city_\", \"value\": \"charlotte\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 786, \"group_name\": \"_city_\", \"value\": \"chicago\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 748, \"group_name\": \"_city_\", \"value\": \"san diego\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 734, \"group_name\": \"_city_\", \"value\": \"wilmington\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 723, \"group_name\": \"_city_\", \"value\": \"boston\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": 
\"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"downes grove\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"l-1855 luxembourg\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"bnei-brak\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"beavercreek\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"ft. 
myers,\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 6759]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9869236350059509, \"percentile_inc_nulls\": 0.9869236350059509, \"value_count\": 798, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 798.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9778782725334167, \"percentile_inc_nulls\": 0.9778782725334167, \"value_count\": 552, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 552.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9721266627311707, \"percentile_inc_nulls\": 0.9721266627311707, \"value_count\": 351, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 351.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9664405584335327, \"percentile_inc_nulls\": 0.9664405584335327, \"value_count\": 347, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 347.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9614754319190979, 
\"percentile_inc_nulls\": 0.9614754319190979, \"value_count\": 303, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 303.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9569363594055176, \"percentile_inc_nulls\": 0.9569363594055176, \"value_count\": 277, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 277.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9525775909423828, \"percentile_inc_nulls\": 0.9525775909423828, \"value_count\": 266, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 266.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9486120939254761, \"percentile_inc_nulls\": 0.9486120939254761, \"value_count\": 242, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 242.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9448267817497253, \"percentile_inc_nulls\": 0.9448267817497253, \"value_count\": 231, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 231.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9411562085151672, \"percentile_inc_nulls\": 0.9411562085151672, \"value_count\": 224, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 224.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.937616765499115, \"percentile_inc_nulls\": 0.937616765499115, \"value_count\": 216, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 216.0, 
\"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9341428279876709, \"percentile_inc_nulls\": 0.9341428279876709, \"value_count\": 212, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 212.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.930881917476654, \"percentile_inc_nulls\": 0.930881917476654, \"value_count\": 199, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 199.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9276373982429504, \"percentile_inc_nulls\": 0.9276373982429504, \"value_count\": 198, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 198.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9246222972869873, \"percentile_inc_nulls\": 0.9246222972869873, \"value_count\": 184, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 184.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9217710494995117, \"percentile_inc_nulls\": 0.9217710494995117, \"value_count\": 174, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 174.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9162651896476746, \"percentile_inc_nulls\": 0.9162651896476746, \"value_count\": 168, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 336.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9136925339698792, \"percentile_inc_nulls\": 0.9136925339698792, \"value_count\": 157, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, 
\"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 157.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9112181663513184, \"percentile_inc_nulls\": 0.9112181663513184, \"value_count\": 151, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 151.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9088093638420105, \"percentile_inc_nulls\": 0.9088093638420105, \"value_count\": 147, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 147.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9065152406692505, \"percentile_inc_nulls\": 0.9065152406692505, \"value_count\": 140, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 140.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9042539000511169, \"percentile_inc_nulls\": 0.9042539000511169, \"value_count\": 138, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9021236896514893, \"percentile_inc_nulls\": 0.9021236896514893, \"value_count\": 130, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 130.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9001245498657227, \"percentile_inc_nulls\": 0.9001245498657227, \"value_count\": 122, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 122.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8982564806938171, \"percentile_inc_nulls\": 0.8982564806938171, \"value_count\": 114, 
\"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8964048027992249, \"percentile_inc_nulls\": 0.8964048027992249, \"value_count\": 113, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 113.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8946186900138855, \"percentile_inc_nulls\": 0.8946186900138855, \"value_count\": 109, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 109.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8929636478424072, \"percentile_inc_nulls\": 0.8929636478424072, \"value_count\": 101, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 101.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8913413882255554, \"percentile_inc_nulls\": 0.8913413882255554, \"value_count\": 99, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8897355198860168, \"percentile_inc_nulls\": 0.8897355198860168, \"value_count\": 98, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8866220712661743, \"percentile_inc_nulls\": 0.8866220712661743, \"value_count\": 95, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 190.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8850981593132019, 
\"percentile_inc_nulls\": 0.8850981593132019, \"value_count\": 93, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 93.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8836561441421509, \"percentile_inc_nulls\": 0.8836561441421509, \"value_count\": 88, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8822305202484131, \"percentile_inc_nulls\": 0.8822305202484131, \"value_count\": 87, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 87.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8810015320777893, \"percentile_inc_nulls\": 0.8810015320777893, \"value_count\": 75, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 75.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8798708915710449, \"percentile_inc_nulls\": 0.8798708915710449, \"value_count\": 69, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 69.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.877707839012146, \"percentile_inc_nulls\": 0.877707839012146, \"value_count\": 66, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 132.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8766427636146545, \"percentile_inc_nulls\": 0.8766427636146545, \"value_count\": 65, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 65.0, \"distinct_value_count\": 
36703}, {\"percentile_ex_nulls\": 0.8756431937217712, \"percentile_inc_nulls\": 0.8756431937217712, \"value_count\": 61, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 61.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8746927380561829, \"percentile_inc_nulls\": 0.8746927380561829, \"value_count\": 58, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8737751245498657, \"percentile_inc_nulls\": 0.8737751245498657, \"value_count\": 56, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8728902339935303, \"percentile_inc_nulls\": 0.8728902339935303, \"value_count\": 54, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 54.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8711532950401306, \"percentile_inc_nulls\": 0.8711532950401306, \"value_count\": 53, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 106.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8703175783157349, \"percentile_inc_nulls\": 0.8703175783157349, \"value_count\": 51, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 51.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8686789274215698, \"percentile_inc_nulls\": 0.8686789274215698, \"value_count\": 50, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, 
\"sum_tokens_in_value_count_group\": 100.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8678759932518005, \"percentile_inc_nulls\": 0.8678759932518005, \"value_count\": 49, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 49.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.866401195526123, \"percentile_inc_nulls\": 0.866401195526123, \"value_count\": 45, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8656966090202332, \"percentile_inc_nulls\": 0.8656966090202332, \"value_count\": 43, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 43.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8650083541870117, \"percentile_inc_nulls\": 0.8650083541870117, \"value_count\": 42, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8643528819084167, \"percentile_inc_nulls\": 0.8643528819084167, \"value_count\": 40, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 40.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.863074779510498, \"percentile_inc_nulls\": 0.863074779510498, \"value_count\": 39, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.861829400062561, \"percentile_inc_nulls\": 0.861829400062561, \"value_count\": 38, \"group_name\": \"_street_address_\", 
\"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 76.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8600596189498901, \"percentile_inc_nulls\": 0.8600596189498901, \"value_count\": 36, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8589125871658325, \"percentile_inc_nulls\": 0.8589125871658325, \"value_count\": 35, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.857241153717041, \"percentile_inc_nulls\": 0.857241153717041, \"value_count\": 34, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.855618953704834, \"percentile_inc_nulls\": 0.855618953704834, \"value_count\": 33, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8545701503753662, \"percentile_inc_nulls\": 0.8545701503753662, \"value_count\": 32, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 64.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8530462384223938, \"percentile_inc_nulls\": 0.8530462384223938, \"value_count\": 31, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 93.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8525546789169312, \"percentile_inc_nulls\": 0.8525546789169312, 
\"value_count\": 30, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 30.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8511290550231934, \"percentile_inc_nulls\": 0.8511290550231934, \"value_count\": 29, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 87.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8492937088012695, \"percentile_inc_nulls\": 0.8492937088012695, \"value_count\": 28, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8470815420150757, \"percentile_inc_nulls\": 0.8470815420150757, \"value_count\": 27, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 135.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8453773856163025, \"percentile_inc_nulls\": 0.8453773856163025, \"value_count\": 26, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 104.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8408710956573486, \"percentile_inc_nulls\": 0.8408710956573486, \"value_count\": 25, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 275.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8389047384262085, \"percentile_inc_nulls\": 0.8389047384262085, \"value_count\": 24, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 
0.8373971581459045, \"percentile_inc_nulls\": 0.8373971581459045, \"value_count\": 23, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 92.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8363156318664551, \"percentile_inc_nulls\": 0.8363156318664551, \"value_count\": 22, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 66.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8345950841903687, \"percentile_inc_nulls\": 0.8345950841903687, \"value_count\": 21, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 105.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8319732546806335, \"percentile_inc_nulls\": 0.8319732546806335, \"value_count\": 20, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.828859806060791, \"percentile_inc_nulls\": 0.828859806060791, \"value_count\": 19, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 190.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8262052536010742, \"percentile_inc_nulls\": 0.8262052536010742, \"value_count\": 18, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 162.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8250909447669983, \"percentile_inc_nulls\": 0.8250909447669983, \"value_count\": 17, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 68.0, 
\"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8222069144248962, \"percentile_inc_nulls\": 0.8222069144248962, \"value_count\": 16, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 176.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8190115690231323, \"percentile_inc_nulls\": 0.8190115690231323, \"value_count\": 15, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 195.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8148821592330933, \"percentile_inc_nulls\": 0.8148821592330933, \"value_count\": 14, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 252.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8112607598304749, \"percentile_inc_nulls\": 0.8112607598304749, \"value_count\": 13, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 221.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8055583238601685, \"percentile_inc_nulls\": 0.8055583238601685, \"value_count\": 12, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 348.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8001507520675659, \"percentile_inc_nulls\": 0.8001507520675659, \"value_count\": 11, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 330.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.7937600612640381, \"percentile_inc_nulls\": 0.7937600612640381, \"value_count\": 10, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, 
\"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 390.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.7862386703491211, \"percentile_inc_nulls\": 0.7862386703491211, \"value_count\": 9, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 459.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.7773244380950928, \"percentile_inc_nulls\": 0.7773244380950928, \"value_count\": 8, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 544.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.767918586730957, \"percentile_inc_nulls\": 0.767918586730957, \"value_count\": 7, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 574.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.754645586013794, \"percentile_inc_nulls\": 0.754645586013794, \"value_count\": 6, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 810.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.7358830571174622, \"percentile_inc_nulls\": 0.7358830571174622, \"value_count\": 5, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1145.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.7013404369354248, \"percentile_inc_nulls\": 0.7013404369354248, \"value_count\": 4, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 2108.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.6384655833244324, \"percentile_inc_nulls\": 0.6384655833244324, \"value_count\": 3, \"group_name\": 
\"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 3837.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.4765673875808716, \"percentile_inc_nulls\": 0.4765673875808716, \"value_count\": 2, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 9880.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 1, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 29083.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 798, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 798.0, \"distinct_value_count\": 36703}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"street_address\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 36703 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 798, \"group_name\": \"_street_address_\", \"value\": \"4500 park granada\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, 
{\"value_count\": 552, \"group_name\": \"_street_address_\", \"value\": \"711 high st\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 351, \"group_name\": \"_street_address_\", \"value\": \"11 madison ave\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 347, \"group_name\": \"_street_address_\", \"value\": \"383 madison ave\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 303, \"group_name\": \"_street_address_\", \"value\": \"8400 normandale lk blvd\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 277, \"group_name\": \"_street_address_\", \"value\": \"1585 broadway\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 266, \"group_name\": \"_street_address_\", \"value\": \"85 broad st\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 242, \"group_name\": \"_street_address_\", \"value\": \"7485 new horizon way\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 231, \"group_name\": \"_street_address_\", \"value\": \"co wilmington trust company\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 224, \"group_name\": \"_street_address_\", \"value\": \"4ld financial ctr floor 10\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", 
\"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"4450 belden vlg st nw\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"two jericho plz\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"970 lk carillon dr\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"250 vly blvd\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"one north federal hwy\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 798]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}], \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\"}, {\"mode\": \"vega-lite\"});\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"vconcat\": [{\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9997377991676331, \"percentile_inc_nulls\": 0.9997377991676331, \"value_count\": 8, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 0.9989348649978638, \"percentile_inc_nulls\": 0.9989348649978638, \"value_count\": 7, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 49.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 0.9960836172103882, \"percentile_inc_nulls\": 0.9960836172103882, \"value_count\": 6, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 174.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 0.9845311641693115, \"percentile_inc_nulls\": 0.9845311641693115, \"value_count\": 5, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 705.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 0.9480876922607422, \"percentile_inc_nulls\": 0.9480876922607422, \"value_count\": 4, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 2224.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 0.8398387432098389, \"percentile_inc_nulls\": 0.8398387432098389, \"value_count\": 3, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 6606.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 
0.5750991106033325, \"percentile_inc_nulls\": 0.5750991106033325, \"value_count\": 2, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 16156.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 1, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 35096.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 8, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 46111}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"company_name\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 46111 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 8, \"group_name\": \"_company_name_\", \"value\": \"la jolla pharmaceutical company\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 8, \"group_name\": \"_company_name_\", \"value\": \"comprehensive care corporation\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 
46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"comerica inc /new/\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"omega healthcare investors incorporated\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"camelot corporation\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"empire petroleum corporation\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"stillwater mining co /de/\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"dycom industries incorporated\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"softech incorporated\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 6, \"group_name\": \"_company_name_\", \"value\": \"microvision incorporated\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": 
\"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"moringa acquisition corporation\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"bank 2021 bnk38\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"bank 2021 bnk36\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"morgan stanley capital i trust 2016 ubs11\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"morgan stanley capital i trust 2016 ubs12\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 8]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.0132485032081604, \"percentile_inc_nulls\": 0.013862967491149902, \"value_count\": 47, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, 
\"sum_tokens_in_value_count_group\": 47.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.012494266033172607, \"percentile_inc_nulls\": 0.013109147548675537, \"value_count\": 46, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 46.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.011789202690124512, \"percentile_inc_nulls\": 0.012404561042785645, \"value_count\": 43, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 43.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.01121532917022705, \"percentile_inc_nulls\": 0.011831045150756836, \"value_count\": 35, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 35.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.010674238204956055, \"percentile_inc_nulls\": 0.011290252208709717, \"value_count\": 33, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.010149538516998291, \"percentile_inc_nulls\": 0.010765910148620605, \"value_count\": 32, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 32.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.009674012660980225, \"percentile_inc_nulls\": 0.010290682315826416, \"value_count\": 29, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 29.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.009264111518859863, \"percentile_inc_nulls\": 0.009881019592285156, \"value_count\": 25, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 
61026, \"sum_tokens_in_value_count_group\": 25.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.008870601654052734, \"percentile_inc_nulls\": 0.009487748146057129, \"value_count\": 24, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 24.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.008181929588317871, \"percentile_inc_nulls\": 0.008799552917480469, \"value_count\": 21, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.007870376110076904, \"percentile_inc_nulls\": 0.008488178253173828, \"value_count\": 19, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 19.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.007575273513793945, \"percentile_inc_nulls\": 0.008193254470825195, \"value_count\": 18, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 18.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.007050573825836182, \"percentile_inc_nulls\": 0.007668852806091309, \"value_count\": 16, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 32.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.00680464506149292, \"percentile_inc_nulls\": 0.007423043251037598, \"value_count\": 15, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 15.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.006115972995758057, \"percentile_inc_nulls\": 0.0067348480224609375, \"value_count\": 14, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, 
\"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.00590282678604126, \"percentile_inc_nulls\": 0.006521821022033691, \"value_count\": 13, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 13.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.00570601224899292, \"percentile_inc_nulls\": 0.006325185298919678, \"value_count\": 12, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 12.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.005164921283721924, \"percentile_inc_nulls\": 0.005784392356872559, \"value_count\": 11, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0041811466217041016, \"percentile_inc_nulls\": 0.00480121374130249, \"value_count\": 10, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 60.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.003885984420776367, \"percentile_inc_nulls\": 0.004506289958953857, \"value_count\": 9, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 18.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.003623664379119873, \"percentile_inc_nulls\": 0.004244089126586914, \"value_count\": 8, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0035088658332824707, \"percentile_inc_nulls\": 0.0041294097900390625, \"value_count\": 7, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, 
\"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 7.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.002721846103668213, \"percentile_inc_nulls\": 0.003342866897583008, \"value_count\": 6, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 48.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0019840002059936523, \"percentile_inc_nulls\": 0.002605438232421875, \"value_count\": 5, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 45.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0012625455856323242, \"percentile_inc_nulls\": 0.00188446044921875, \"value_count\": 4, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 44.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0007214546203613281, \"percentile_inc_nulls\": 0.0013436675071716309, \"value_count\": 3, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0003935098648071289, \"percentile_inc_nulls\": 0.0010159611701965332, \"value_count\": 2, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 20.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0006226897239685059, \"value_count\": 1, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 24.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.8305076360702515, \"percentile_inc_nulls\": 0.8306131958961487, \"value_count\": 10337, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, 
\"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 10337.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.691791832447052, \"percentile_inc_nulls\": 0.6919837594032288, \"value_count\": 8460, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 8460.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.6141864061355591, \"percentile_inc_nulls\": 0.6144266128540039, \"value_count\": 4733, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 4733.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.563307523727417, \"percentile_inc_nulls\": 0.5635794401168823, \"value_count\": 3103, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 3103.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.5253000259399414, \"percentile_inc_nulls\": 0.5255956649780273, \"value_count\": 2318, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 2318.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.49045711755752563, \"percentile_inc_nulls\": 0.4907744526863098, \"value_count\": 2125, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 2125.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.4584508538246155, \"percentile_inc_nulls\": 0.45878803730010986, \"value_count\": 1952, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1952.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.42987143993377686, \"percentile_inc_nulls\": 0.4302264451980591, \"value_count\": 1743, \"group_name\": \"_state_\", \"total_non_null_rows\": 
60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1743.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.4037187695503235, \"percentile_inc_nulls\": 0.4040900468826294, \"value_count\": 1595, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1595.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.3788614273071289, \"percentile_inc_nulls\": 0.3792482018470764, \"value_count\": 1516, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1516.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.35418444871902466, \"percentile_inc_nulls\": 0.35458654165267944, \"value_count\": 1505, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1505.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.33116352558135986, \"percentile_inc_nulls\": 0.3315799832344055, \"value_count\": 1404, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1404.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.3082737326622009, \"percentile_inc_nulls\": 0.3087044954299927, \"value_count\": 1396, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1396.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.2873516082763672, \"percentile_inc_nulls\": 0.28779536485671997, \"value_count\": 1276, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1276.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.26756083965301514, \"percentile_inc_nulls\": 0.2680169343948364, \"value_count\": 1207, \"group_name\": \"_state_\", 
\"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1207.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.24914735555648804, \"percentile_inc_nulls\": 0.2496148943901062, \"value_count\": 1123, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1123.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.23243916034698486, \"percentile_inc_nulls\": 0.23291712999343872, \"value_count\": 1019, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1019.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.21587854623794556, \"percentile_inc_nulls\": 0.21636676788330078, \"value_count\": 1010, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1010.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.20199054479599, \"percentile_inc_nulls\": 0.20248746871948242, \"value_count\": 847, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 847.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.18875843286514282, \"percentile_inc_nulls\": 0.18926358222961426, \"value_count\": 807, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 807.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.17613303661346436, \"percentile_inc_nulls\": 0.17664599418640137, \"value_count\": 770, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 770.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.1640486717224121, \"percentile_inc_nulls\": 0.1645691990852356, \"value_count\": 737, \"group_name\": 
\"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 737.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.15265297889709473, \"percentile_inc_nulls\": 0.15318059921264648, \"value_count\": 695, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 695.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.14209353923797607, \"percentile_inc_nulls\": 0.14262771606445312, \"value_count\": 644, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 644.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.13258343935012817, \"percentile_inc_nulls\": 0.13312357664108276, \"value_count\": 580, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 580.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.12354886531829834, \"percentile_inc_nulls\": 0.12409466505050659, \"value_count\": 551, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 551.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.11457991600036621, \"percentile_inc_nulls\": 0.11513125896453857, \"value_count\": 547, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 547.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.10720139741897583, \"percentile_inc_nulls\": 0.1077573299407959, \"value_count\": 450, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 450.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.10013443231582642, \"percentile_inc_nulls\": 0.10069477558135986, \"value_count\": 431, 
\"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 431.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0935102105140686, \"percentile_inc_nulls\": 0.09407466650009155, \"value_count\": 404, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 404.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.08788615465164185, \"percentile_inc_nulls\": 0.08845412731170654, \"value_count\": 343, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 343.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.08291792869567871, \"percentile_inc_nulls\": 0.08348900079727173, \"value_count\": 303, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 303.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.07842528820037842, \"percentile_inc_nulls\": 0.0789991021156311, \"value_count\": 274, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 274.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.07416212558746338, \"percentile_inc_nulls\": 0.07473862171173096, \"value_count\": 260, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 260.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.07007938623428345, \"percentile_inc_nulls\": 0.07065838575363159, \"value_count\": 249, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 249.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.06240570545196533, \"percentile_inc_nulls\": 0.06298953294754028, \"value_count\": 234, 
\"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 468.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0587000846862793, \"percentile_inc_nulls\": 0.05928617715835571, \"value_count\": 226, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 226.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.05512559413909912, \"percentile_inc_nulls\": 0.055713951587677, \"value_count\": 218, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 218.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.05199384689331055, \"percentile_inc_nulls\": 0.052584171295166016, \"value_count\": 191, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 191.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.048944056034088135, \"percentile_inc_nulls\": 0.049536287784576416, \"value_count\": 186, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 186.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.04597628116607666, \"percentile_inc_nulls\": 0.0465703010559082, \"value_count\": 181, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 181.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.04348397254943848, \"percentile_inc_nulls\": 0.044079601764678955, \"value_count\": 152, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 152.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.041270434856414795, \"percentile_inc_nulls\": 0.04186737537384033, \"value_count\": 
135, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 135.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.03918802738189697, \"percentile_inc_nulls\": 0.039786338806152344, \"value_count\": 127, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 127.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.037335216999053955, \"percentile_inc_nulls\": 0.03793466091156006, \"value_count\": 113, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 113.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.03549879789352417, \"percentile_inc_nulls\": 0.036099374294281006, \"value_count\": 112, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.03369516134262085, \"percentile_inc_nulls\": 0.03429687023162842, \"value_count\": 110, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.03195708990097046, \"percentile_inc_nulls\": 0.032559871673583984, \"value_count\": 106, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 106.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.03026825189590454, \"percentile_inc_nulls\": 0.030872106552124023, \"value_count\": 103, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 103.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.028595805168151855, \"percentile_inc_nulls\": 0.02920067310333252, 
\"value_count\": 102, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.026972532272338867, \"percentile_inc_nulls\": 0.027578413486480713, \"value_count\": 99, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.02552962303161621, \"percentile_inc_nulls\": 0.026136398315429688, \"value_count\": 88, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.022938966751098633, \"percentile_inc_nulls\": 0.023547351360321045, \"value_count\": 79, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 158.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.021774768829345703, \"percentile_inc_nulls\": 0.022383928298950195, \"value_count\": 71, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 71.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.020659804344177246, \"percentile_inc_nulls\": 0.021269619464874268, \"value_count\": 68, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 68.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.01956123113632202, \"percentile_inc_nulls\": 0.020171701908111572, \"value_count\": 67, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 67.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.018577396869659424, \"percentile_inc_nulls\": 
0.019188523292541504, \"value_count\": 60, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 60.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.017626404762268066, \"percentile_inc_nulls\": 0.0182381272315979, \"value_count\": 58, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.015757203102111816, \"percentile_inc_nulls\": 0.016370058059692383, \"value_count\": 57, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.014838993549346924, \"percentile_inc_nulls\": 0.015452444553375244, \"value_count\": 56, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.014019131660461426, \"percentile_inc_nulls\": 0.014633119106292725, \"value_count\": 50, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 50.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 47, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 47.0, \"distinct_value_count\": 172}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": 
\"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"state\\\"\", \"subtitle\": \"In this col, 38 values (0.1%) are null and there are 172 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 10337, \"group_name\": \"_state_\", \"value\": \"ca\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 8460, \"group_name\": \"_state_\", \"value\": \"ny\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 4733, \"group_name\": \"_state_\", \"value\": \"tx\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 3103, \"group_name\": \"_state_\", \"value\": \"fl\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 2318, \"group_name\": \"_state_\", \"value\": \"ma\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 2125, \"group_name\": \"_state_\", \"value\": \"il\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1952, \"group_name\": \"_state_\", \"value\": \"nj\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1743, \"group_name\": \"_state_\", \"value\": \"pa\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1595, \"group_name\": \"_state_\", \"value\": \"md\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1516, \"group_name\": 
\"_state_\", \"value\": \"nv\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"j1\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"w5\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"s9\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"a7\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"r4\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 10337]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, 
\"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.34724318981170654, \"percentile_inc_nulls\": 0.3473929166793823, \"value_count\": 51, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3439651131629944, \"percentile_inc_nulls\": 0.34411561489105225, \"value_count\": 50, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.33994948863983154, \"percentile_inc_nulls\": 0.3401009440422058, \"value_count\": 49, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 245.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3344423770904541, \"percentile_inc_nulls\": 0.33459508419036865, \"value_count\": 48, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 336.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.33213138580322266, \"percentile_inc_nulls\": 0.3322846293449402, \"value_count\": 47, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 141.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3291155695915222, \"percentile_inc_nulls\": 0.3292694687843323, \"value_count\": 46, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 184.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3261653184890747, \"percentile_inc_nulls\": 0.32631993293762207, \"value_count\": 45, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 180.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3211171627044678, \"percentile_inc_nulls\": 0.32127290964126587, \"value_count\": 44, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 308.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3168884515762329, \"percentile_inc_nulls\": 0.3170452117919922, \"value_count\": 43, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 258.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.31482332944869995, \"percentile_inc_nulls\": 0.31498050689697266, \"value_count\": 42, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 126.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3101193308830261, \"percentile_inc_nulls\": 0.3102775812149048, \"value_count\": 41, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 287.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.30749690532684326, \"percentile_inc_nulls\": 0.3076557517051697, \"value_count\": 40, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3030223846435547, \"percentile_inc_nulls\": 0.3031822443008423, \"value_count\": 39, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 273.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.30115389823913574, \"percentile_inc_nulls\": 0.30131417512893677, \"value_count\": 38, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2987281084060669, \"percentile_inc_nulls\": 0.2988889813423157, \"value_count\": 37, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 148.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2957778573036194, \"percentile_inc_nulls\": 0.29593944549560547, \"value_count\": 36, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 180.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.29348325729370117, \"percentile_inc_nulls\": 0.29364532232284546, \"value_count\": 35, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 140.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2884678244590759, \"percentile_inc_nulls\": 0.2886310815811157, \"value_count\": 34, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 306.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.285222589969635, \"percentile_inc_nulls\": 0.2853865623474121, \"value_count\": 33, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 198.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2831246256828308, \"percentile_inc_nulls\": 0.2832890748977661, \"value_count\": 32, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 128.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2770274877548218, \"percentile_inc_nulls\": 0.2771933078765869, \"value_count\": 31, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 372.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2735854983329773, \"percentile_inc_nulls\": 0.2737521529197693, \"value_count\": 30, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 210.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2726348638534546, \"percentile_inc_nulls\": 0.2728017568588257, \"value_count\": 29, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.26942241191864014, \"percentile_inc_nulls\": 0.26959002017974854, \"value_count\": 28, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 196.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.26632463932037354, \"percentile_inc_nulls\": 0.26649296283721924, \"value_count\": 27, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 189.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.26078474521636963, \"percentile_inc_nulls\": 0.2609543204307556, \"value_count\": 26, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 338.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.25586771965026855, \"percentile_inc_nulls\": 0.2560384273529053, \"value_count\": 25, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 300.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.24996721744537354, \"percentile_inc_nulls\": 0.2501392960548401, \"value_count\": 24, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 360.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.24544352293014526, \"percentile_inc_nulls\": 0.245616614818573, \"value_count\": 23, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 276.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.23895299434661865, \"percentile_inc_nulls\": 0.23912757635116577, \"value_count\": 22, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 396.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.23551106452941895, \"percentile_inc_nulls\": 0.23568642139434814, \"value_count\": 21, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 210.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2273159623146057, \"percentile_inc_nulls\": 0.22749322652816772, \"value_count\": 20, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 500.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2207762598991394, \"percentile_inc_nulls\": 0.2209550142288208, \"value_count\": 19, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 399.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2142857313156128, \"percentile_inc_nulls\": 0.21446597576141357, \"value_count\": 18, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 396.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2109420895576477, \"percentile_inc_nulls\": 0.21112310886383057, \"value_count\": 17, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 204.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.20648396015167236, \"percentile_inc_nulls\": 0.2066659927368164, \"value_count\": 16, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 272.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.1981249451637268, \"percentile_inc_nulls\": 0.19830894470214844, \"value_count\": 15, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 510.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.19101160764694214, \"percentile_inc_nulls\": 0.1911972165107727, \"value_count\": 14, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 434.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.18227559328079224, \"percentile_inc_nulls\": 0.18246322870254517, \"value_count\": 13, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 533.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.17460501194000244, \"percentile_inc_nulls\": 0.17479437589645386, \"value_count\": 12, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 468.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.16559040546417236, \"percentile_inc_nulls\": 0.16578179597854614, \"value_count\": 11, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 550.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.15854257345199585, \"percentile_inc_nulls\": 0.15873563289642334, \"value_count\": 10, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 430.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.14836424589157104, \"percentile_inc_nulls\": 0.14855962991714478, \"value_count\": 9, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 621.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.13669443130493164, \"percentile_inc_nulls\": 0.13689249753952026, \"value_count\": 8, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 712.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.12717169523239136, \"percentile_inc_nulls\": 0.12737196683883667, \"value_count\": 7, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 581.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.11497735977172852, \"percentile_inc_nulls\": 0.11518043279647827, \"value_count\": 6, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 744.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.09989839792251587, \"percentile_inc_nulls\": 0.10010486841201782, \"value_count\": 5, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 920.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.0844915509223938, \"percentile_inc_nulls\": 0.08470159769058228, \"value_count\": 4, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 940.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.06585592031478882, \"percentile_inc_nulls\": 0.06607019901275635, \"value_count\": 3, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1137.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.040352702140808105, \"percentile_inc_nulls\": 0.0405728816986084, \"value_count\": 2, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1556.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.00022941827774047852, \"value_count\": 1, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 2462.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.8892185091972351, \"percentile_inc_nulls\": 0.8892439603805542, \"value_count\": 6759, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 6759.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.8651740550994873, \"percentile_inc_nulls\": 0.8652049899101257, \"value_count\": 1467, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1467.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.8483904600143433, \"percentile_inc_nulls\": 0.8484252691268921, \"value_count\": 1024, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1024.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.8343932628631592, \"percentile_inc_nulls\": 0.8344312310218811, \"value_count\": 854, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 854.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.8206254243850708, \"percentile_inc_nulls\": 0.8206666111946106, \"value_count\": 840, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 840.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.8072838187217712, \"percentile_inc_nulls\": 0.8073280453681946, \"value_count\": 814, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 814.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7944011092185974, \"percentile_inc_nulls\": 0.7944482564926147, \"value_count\": 786, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 786.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7821412086486816, \"percentile_inc_nulls\": 0.7821912169456482, \"value_count\": 748, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 748.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7701107859611511, \"percentile_inc_nulls\": 0.7701635360717773, \"value_count\": 734, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 734.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7582606673240662, \"percentile_inc_nulls\": 0.758316159248352, \"value_count\": 723, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 723.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7466564178466797, \"percentile_inc_nulls\": 0.7467144727706909, \"value_count\": 708, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 708.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7364616394042969, \"percentile_inc_nulls\": 0.7365221381187439, \"value_count\": 622, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 622.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7267422676086426, \"percentile_inc_nulls\": 0.7268049716949463, \"value_count\": 593, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 593.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7170720100402832, \"percentile_inc_nulls\": 0.7171369791030884, \"value_count\": 590, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 590.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7074673771858215, \"percentile_inc_nulls\": 0.7075344920158386, \"value_count\": 586, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 586.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6978955268859863, \"percentile_inc_nulls\": 0.6979647874832153, \"value_count\": 584, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 584.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.688372790813446, \"percentile_inc_nulls\": 0.6884442567825317, \"value_count\": 581, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 581.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6793090105056763, \"percentile_inc_nulls\": 0.6793825626373291, \"value_count\": 553, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 553.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6723759174346924, \"percentile_inc_nulls\": 0.6724510788917542, \"value_count\": 423, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 423.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6655576229095459, \"percentile_inc_nulls\": 0.6656343340873718, \"value_count\": 416, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 416.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6590179204940796, \"percentile_inc_nulls\": 0.6590961217880249, \"value_count\": 399, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 399.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6526912450790405, \"percentile_inc_nulls\": 0.6527709364891052, \"value_count\": 386, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 386.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6471513509750366, \"percentile_inc_nulls\": 0.6472322940826416, \"value_count\": 338, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 338.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6416934728622437, \"percentile_inc_nulls\": 0.6417756080627441, \"value_count\": 333, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 333.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6312692165374756, \"percentile_inc_nulls\": 0.6313538551330566, \"value_count\": 318, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 636.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.626384973526001, \"percentile_inc_nulls\": 0.626470685005188, \"value_count\": 298, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 298.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6217301487922668, \"percentile_inc_nulls\": 0.6218169331550598, \"value_count\": 284, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 284.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.612518846988678, \"percentile_inc_nulls\": 0.6126077175140381, \"value_count\": 281, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 562.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6079951524734497, \"percentile_inc_nulls\": 0.608085036277771, \"value_count\": 276, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 276.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.603569746017456, \"percentile_inc_nulls\": 0.6036607027053833, \"value_count\": 270, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 270.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5992263555526733, \"percentile_inc_nulls\": 0.5993183255195618, \"value_count\": 265, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 265.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5948993563652039, \"percentile_inc_nulls\": 0.5949922800064087, \"value_count\": 264, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 264.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5908181667327881, \"percentile_inc_nulls\": 0.5909121036529541, \"value_count\": 249, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 249.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5869500637054443, \"percentile_inc_nulls\": 0.5870448350906372, \"value_count\": 236, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 236.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5831148028373718, \"percentile_inc_nulls\": 0.5832104682922363, \"value_count\": 234, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.575706422328949, \"percentile_inc_nulls\": 0.5758037567138672, \"value_count\": 226, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 452.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5720186233520508, \"percentile_inc_nulls\": 0.5721167922019958, \"value_count\": 225, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 225.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5685439109802246, \"percentile_inc_nulls\": 0.5686428546905518, \"value_count\": 212, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 212.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5616599917411804, \"percentile_inc_nulls\": 0.5617605447769165, \"value_count\": 210, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 420.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.558234453201294, \"percentile_inc_nulls\": 0.5583357810974121, \"value_count\": 209, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 209.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5548253059387207, \"percentile_inc_nulls\": 0.554927408695221, \"value_count\": 208, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 208.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5514816641807556, \"percentile_inc_nulls\": 0.5515846014022827, \"value_count\": 204, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 204.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5482856035232544, \"percentile_inc_nulls\": 0.548389196395874, \"value_count\": 195, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 195.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5451222658157349, \"percentile_inc_nulls\": 0.5452266335487366, \"value_count\": 193, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 193.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5419917106628418, \"percentile_inc_nulls\": 0.5420968532562256, \"value_count\": 191, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 191.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5389431715011597, \"percentile_inc_nulls\": 0.5390489101409912, \"value_count\": 186, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 186.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5359601378440857, \"percentile_inc_nulls\": 0.5360665917396545, \"value_count\": 182, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 182.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.532993495464325, \"percentile_inc_nulls\": 0.5331006050109863, \"value_count\": 181, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 181.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5302727222442627, \"percentile_inc_nulls\": 0.5303804874420166, \"value_count\": 166, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 166.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5275847315788269, \"percentile_inc_nulls\": 0.5276931524276733, \"value_count\": 164, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 164.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5249786972999573, \"percentile_inc_nulls\": 0.5250876545906067, \"value_count\": 159, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 159.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5224218368530273, \"percentile_inc_nulls\": 0.5225313901901245, \"value_count\": 156, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 156.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5198976993560791, \"percentile_inc_nulls\": 0.520007848739624, \"value_count\": 154, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 154.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5173900127410889, \"percentile_inc_nulls\": 0.5175007581710815, \"value_count\": 153, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 153.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5148987174034119, \"percentile_inc_nulls\": 0.5150099992752075, \"value_count\": 152, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 152.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5124237537384033, \"percentile_inc_nulls\": 0.5125356316566467, \"value_count\": 151, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 151.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5099652409553528, \"percentile_inc_nulls\": 0.510077714920044, \"value_count\": 150, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 150.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5075395107269287, \"percentile_inc_nulls\": 0.5076524615287781, \"value_count\": 148, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 148.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5051465034484863, \"percentile_inc_nulls\": 0.5052600502967834, \"value_count\": 146, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 146.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4980167746543884, \"percentile_inc_nulls\": 0.4981319308280945, \"value_count\": 145, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 435.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.48621582984924316, \"percentile_inc_nulls\": 0.4863336682319641, \"value_count\": 144, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 720.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4838883876800537, \"percentile_inc_nulls\": 0.4840068221092224, \"value_count\": 142, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 142.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.48161017894744873, \"percentile_inc_nulls\": 0.48172909021377563, \"value_count\": 139, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 139.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.47936469316482544, \"percentile_inc_nulls\": 0.4794841408729553, \"value_count\": 137, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 137.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.47713565826416016, \"percentile_inc_nulls\": 0.47725558280944824, \"value_count\": 136, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 136.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4749557375907898, \"percentile_inc_nulls\": 0.47507619857788086, \"value_count\": 133, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 133.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.47279220819473267, \"percentile_inc_nulls\": 0.4729132056236267, \"value_count\": 132, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 132.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4706615209579468, \"percentile_inc_nulls\": 0.47078293561935425, \"value_count\": 130, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 130.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4685635566711426, \"percentile_inc_nulls\": 0.468685507774353, \"value_count\": 128, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 128.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4664819836616516, \"percentile_inc_nulls\": 0.46660441160202026, \"value_count\": 127, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 127.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4644496440887451, \"percentile_inc_nulls\": 0.4645724892616272, \"value_count\": 124, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 124.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4624336361885071, \"percentile_inc_nulls\": 0.46255695819854736, \"value_count\": 123, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 123.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4604504108428955, \"percentile_inc_nulls\": 0.460574209690094, \"value_count\": 121, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 121.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.45848357677459717, \"percentile_inc_nulls\": 0.4586077928543091, \"value_count\": 120, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.45458269119262695, \"percentile_inc_nulls\": 0.4547078013420105, \"value_count\": 119, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 238.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4526650309562683, \"percentile_inc_nulls\": 0.45279061794281006, \"value_count\": 117, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 117.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.45078015327453613, \"percentile_inc_nulls\": 0.4509061574935913, \"value_count\": 115, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 115.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.445223867893219, \"percentile_inc_nulls\": 0.44535118341445923, \"value_count\": 113, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 339.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4416508078575134, \"percentile_inc_nulls\": 0.44177889823913574, \"value_count\": 109, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 218.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4398806691169739, \"percentile_inc_nulls\": 0.4400091767311096, \"value_count\": 108, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.43812692165374756, \"percentile_inc_nulls\": 0.43825584650039673, \"value_count\": 107, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 107.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.43468499183654785, \"percentile_inc_nulls\": 0.4348146915435791, \"value_count\": 105, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 210.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4314069151878357, \"percentile_inc_nulls\": 0.431537389755249, \"value_count\": 100, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.42978429794311523, \"percentile_inc_nulls\": 0.4299151301383972, \"value_count\": 99, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.428178071975708, \"percentile_inc_nulls\": 0.42830926179885864, \"value_count\": 98, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.42499834299087524, \"percentile_inc_nulls\": 0.4251302480697632, \"value_count\": 97, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 194.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4234248995780945, \"percentile_inc_nulls\": 0.4235571622848511, \"value_count\": 96, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 96.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4218842387199402, \"percentile_inc_nulls\": 0.42201685905456543, \"value_count\": 94, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 94.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4203763008117676, \"percentile_inc_nulls\": 0.4205092787742615, \"value_count\": 92, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 92.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.418884813785553, \"percentile_inc_nulls\": 0.4190181493759155, \"value_count\": 91, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 91.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4145086407661438, \"percentile_inc_nulls\": 0.41464293003082275, \"value_count\": 89, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 267.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4131154417991638, \"percentile_inc_nulls\": 0.4132500886917114, \"value_count\": 85, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 85.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4103618860244751, \"percentile_inc_nulls\": 0.41049718856811523, \"value_count\": 84, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 168.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.40764111280441284, \"percentile_inc_nulls\": 0.40777701139450073, \"value_count\": 83, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 166.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.40495312213897705, \"percentile_inc_nulls\": 0.4050896167755127, \"value_count\": 82, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 164.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4022979140281677, \"percentile_inc_nulls\": 0.4024350047111511, \"value_count\": 81, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 162.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4009866714477539, \"percentile_inc_nulls\": 0.40112411975860596, \"value_count\": 80, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 80.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3983970284461975, \"percentile_inc_nulls\": 0.3985350728034973, \"value_count\": 79, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 158.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3971513509750366, \"percentile_inc_nulls\": 0.3972896933555603, \"value_count\": 76, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 76.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3934635519981384, \"percentile_inc_nulls\": 0.39360272884368896, \"value_count\": 75, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 225.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.39103782176971436, \"percentile_inc_nulls\": 0.39117753505706787, \"value_count\": 74, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 148.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3863174319267273, \"percentile_inc_nulls\": 0.38645821809768677, \"value_count\": 72, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 288.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.38515371084213257, \"percentile_inc_nulls\": 0.3852947950363159, \"value_count\": 71, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 71.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.38400644063949585, \"percentile_inc_nulls\": 0.3841477632522583, \"value_count\": 70, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.38061362504959106, \"percentile_inc_nulls\": 0.3807557225227356, \"value_count\": 69, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 207.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3783845901489258, \"percentile_inc_nulls\": 0.3785271644592285, \"value_count\": 68, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 136.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3773028254508972, \"percentile_inc_nulls\": 0.37744569778442383, \"value_count\": 66, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 66.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3741067051887512, \"percentile_inc_nulls\": 0.37425029277801514, \"value_count\": 65, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 195.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3730577826499939, \"percentile_inc_nulls\": 0.3732016086578369, \"value_count\": 64, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 64.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.37202519178390503, \"percentile_inc_nulls\": 0.37216925621032715, \"value_count\": 63, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 63.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3710089921951294, \"percentile_inc_nulls\": 0.3711532950401306, \"value_count\": 62, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 62.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3690093755722046, \"percentile_inc_nulls\": 0.369154155254364, \"value_count\": 61, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 122.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3660591244697571, \"percentile_inc_nulls\": 0.36620455980300903, \"value_count\": 60, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 180.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.36130595207214355, \"percentile_inc_nulls\": 0.36145251989364624, \"value_count\": 58, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 290.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3594374656677246, \"percentile_inc_nulls\": 0.3595844507217407, \"value_count\": 57, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.35851961374282837, \"percentile_inc_nulls\": 0.3586667776107788, \"value_count\": 56, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.35671669244766235, \"percentile_inc_nulls\": 0.3568642735481262, \"value_count\": 55, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3549465537071228, \"percentile_inc_nulls\": 0.3550945520401001, \"value_count\": 54, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.35147184133529663, \"percentile_inc_nulls\": 0.351620614528656, \"value_count\": 53, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 212.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3489149808883667, \"percentile_inc_nulls\": 0.34906435012817383, \"value_count\": 52, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 156.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 51, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 5121}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"city\\\"\", \"subtitle\": \"In this col, 14 values (0.0%) are null and there are 5121 distinct 
values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 6759, \"group_name\": \"_city_\", \"value\": \"new york\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 1467, \"group_name\": \"_city_\", \"value\": \"houston\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 1024, \"group_name\": \"_city_\", \"value\": \"dallas\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 854, \"group_name\": \"_city_\", \"value\": \"las vegas\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 840, \"group_name\": \"_city_\", \"value\": \"calabasas\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 814, \"group_name\": \"_city_\", \"value\": \"charlotte\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 786, \"group_name\": \"_city_\", \"value\": \"chicago\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 748, \"group_name\": \"_city_\", \"value\": \"san diego\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 734, \"group_name\": \"_city_\", \"value\": \"wilmington\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 723, \"group_name\": \"_city_\", \"value\": \"boston\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": 
\"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"shoreham\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"allston\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"tainan city\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"airport city\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"great neck,\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 6759]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9869236350059509, \"percentile_inc_nulls\": 0.9869236350059509, \"value_count\": 798, 
\"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 798.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9778782725334167, \"percentile_inc_nulls\": 0.9778782725334167, \"value_count\": 552, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 552.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9721266627311707, \"percentile_inc_nulls\": 0.9721266627311707, \"value_count\": 351, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 351.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9664405584335327, \"percentile_inc_nulls\": 0.9664405584335327, \"value_count\": 347, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 347.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9614754319190979, \"percentile_inc_nulls\": 0.9614754319190979, \"value_count\": 303, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 303.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9569363594055176, \"percentile_inc_nulls\": 0.9569363594055176, \"value_count\": 277, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 277.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9525775909423828, \"percentile_inc_nulls\": 0.9525775909423828, \"value_count\": 266, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 266.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9486120939254761, 
\"percentile_inc_nulls\": 0.9486120939254761, \"value_count\": 242, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 242.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9448267817497253, \"percentile_inc_nulls\": 0.9448267817497253, \"value_count\": 231, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 231.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9411562085151672, \"percentile_inc_nulls\": 0.9411562085151672, \"value_count\": 224, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 224.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.937616765499115, \"percentile_inc_nulls\": 0.937616765499115, \"value_count\": 216, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 216.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9341428279876709, \"percentile_inc_nulls\": 0.9341428279876709, \"value_count\": 212, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 212.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.930881917476654, \"percentile_inc_nulls\": 0.930881917476654, \"value_count\": 199, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 199.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9276373982429504, \"percentile_inc_nulls\": 0.9276373982429504, \"value_count\": 198, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 198.0, 
\"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9246222972869873, \"percentile_inc_nulls\": 0.9246222972869873, \"value_count\": 184, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 184.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9217710494995117, \"percentile_inc_nulls\": 0.9217710494995117, \"value_count\": 174, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 174.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9162651896476746, \"percentile_inc_nulls\": 0.9162651896476746, \"value_count\": 168, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 336.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9136925339698792, \"percentile_inc_nulls\": 0.9136925339698792, \"value_count\": 157, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 157.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9112181663513184, \"percentile_inc_nulls\": 0.9112181663513184, \"value_count\": 151, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 151.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9088093638420105, \"percentile_inc_nulls\": 0.9088093638420105, \"value_count\": 147, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 147.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9065152406692505, \"percentile_inc_nulls\": 0.9065152406692505, \"value_count\": 140, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, 
\"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 140.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9042539000511169, \"percentile_inc_nulls\": 0.9042539000511169, \"value_count\": 138, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9021236896514893, \"percentile_inc_nulls\": 0.9021236896514893, \"value_count\": 130, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 130.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9001245498657227, \"percentile_inc_nulls\": 0.9001245498657227, \"value_count\": 122, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 122.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8982564806938171, \"percentile_inc_nulls\": 0.8982564806938171, \"value_count\": 114, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8964048027992249, \"percentile_inc_nulls\": 0.8964048027992249, \"value_count\": 113, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 113.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8946186900138855, \"percentile_inc_nulls\": 0.8946186900138855, \"value_count\": 109, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 109.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8929636478424072, \"percentile_inc_nulls\": 0.8929636478424072, \"value_count\": 101, 
\"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 101.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8913413882255554, \"percentile_inc_nulls\": 0.8913413882255554, \"value_count\": 99, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8897355198860168, \"percentile_inc_nulls\": 0.8897355198860168, \"value_count\": 98, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8866220712661743, \"percentile_inc_nulls\": 0.8866220712661743, \"value_count\": 95, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 190.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8850981593132019, \"percentile_inc_nulls\": 0.8850981593132019, \"value_count\": 93, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 93.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8836561441421509, \"percentile_inc_nulls\": 0.8836561441421509, \"value_count\": 88, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8822305202484131, \"percentile_inc_nulls\": 0.8822305202484131, \"value_count\": 87, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 87.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8810015320777893, 
\"percentile_inc_nulls\": 0.8810015320777893, \"value_count\": 75, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 75.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8798708915710449, \"percentile_inc_nulls\": 0.8798708915710449, \"value_count\": 69, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 69.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.877707839012146, \"percentile_inc_nulls\": 0.877707839012146, \"value_count\": 66, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 132.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8766427636146545, \"percentile_inc_nulls\": 0.8766427636146545, \"value_count\": 65, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 65.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8756431937217712, \"percentile_inc_nulls\": 0.8756431937217712, \"value_count\": 61, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 61.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8746927380561829, \"percentile_inc_nulls\": 0.8746927380561829, \"value_count\": 58, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8737751245498657, \"percentile_inc_nulls\": 0.8737751245498657, \"value_count\": 56, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 
36703}, {\"percentile_ex_nulls\": 0.8728902339935303, \"percentile_inc_nulls\": 0.8728902339935303, \"value_count\": 54, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 54.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8711532950401306, \"percentile_inc_nulls\": 0.8711532950401306, \"value_count\": 53, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 106.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8703175783157349, \"percentile_inc_nulls\": 0.8703175783157349, \"value_count\": 51, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 51.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8686789274215698, \"percentile_inc_nulls\": 0.8686789274215698, \"value_count\": 50, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 100.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8678759932518005, \"percentile_inc_nulls\": 0.8678759932518005, \"value_count\": 49, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 49.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.866401195526123, \"percentile_inc_nulls\": 0.866401195526123, \"value_count\": 45, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8656966090202332, \"percentile_inc_nulls\": 0.8656966090202332, \"value_count\": 43, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, 
\"sum_tokens_in_value_count_group\": 43.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8650083541870117, \"percentile_inc_nulls\": 0.8650083541870117, \"value_count\": 42, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8643528819084167, \"percentile_inc_nulls\": 0.8643528819084167, \"value_count\": 40, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 40.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.863074779510498, \"percentile_inc_nulls\": 0.863074779510498, \"value_count\": 39, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.861829400062561, \"percentile_inc_nulls\": 0.861829400062561, \"value_count\": 38, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 76.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8600596189498901, \"percentile_inc_nulls\": 0.8600596189498901, \"value_count\": 36, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8589125871658325, \"percentile_inc_nulls\": 0.8589125871658325, \"value_count\": 35, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.857241153717041, \"percentile_inc_nulls\": 0.857241153717041, \"value_count\": 34, \"group_name\": \"_street_address_\", 
\"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.855618953704834, \"percentile_inc_nulls\": 0.855618953704834, \"value_count\": 33, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8545701503753662, \"percentile_inc_nulls\": 0.8545701503753662, \"value_count\": 32, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 64.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8530462384223938, \"percentile_inc_nulls\": 0.8530462384223938, \"value_count\": 31, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 93.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8525546789169312, \"percentile_inc_nulls\": 0.8525546789169312, \"value_count\": 30, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 30.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8511290550231934, \"percentile_inc_nulls\": 0.8511290550231934, \"value_count\": 29, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 87.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8492937088012695, \"percentile_inc_nulls\": 0.8492937088012695, \"value_count\": 28, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8470815420150757, \"percentile_inc_nulls\": 0.8470815420150757, 
\"value_count\": 27, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 135.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8453773856163025, \"percentile_inc_nulls\": 0.8453773856163025, \"value_count\": 26, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 104.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8408710956573486, \"percentile_inc_nulls\": 0.8408710956573486, \"value_count\": 25, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 275.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8389047384262085, \"percentile_inc_nulls\": 0.8389047384262085, \"value_count\": 24, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8373971581459045, \"percentile_inc_nulls\": 0.8373971581459045, \"value_count\": 23, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 92.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8363156318664551, \"percentile_inc_nulls\": 0.8363156318664551, \"value_count\": 22, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 66.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8345950841903687, \"percentile_inc_nulls\": 0.8345950841903687, \"value_count\": 21, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 105.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 
0.8319732546806335, \"percentile_inc_nulls\": 0.8319732546806335, \"value_count\": 20, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.828859806060791, \"percentile_inc_nulls\": 0.828859806060791, \"value_count\": 19, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 190.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8262052536010742, \"percentile_inc_nulls\": 0.8262052536010742, \"value_count\": 18, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 162.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8250909447669983, \"percentile_inc_nulls\": 0.8250909447669983, \"value_count\": 17, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 68.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8222069144248962, \"percentile_inc_nulls\": 0.8222069144248962, \"value_count\": 16, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 176.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8190115690231323, \"percentile_inc_nulls\": 0.8190115690231323, \"value_count\": 15, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 195.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8148821592330933, \"percentile_inc_nulls\": 0.8148821592330933, \"value_count\": 14, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 252.0, 
\"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8112607598304749, \"percentile_inc_nulls\": 0.8112607598304749, \"value_count\": 13, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 221.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8055583238601685, \"percentile_inc_nulls\": 0.8055583238601685, \"value_count\": 12, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 348.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8001507520675659, \"percentile_inc_nulls\": 0.8001507520675659, \"value_count\": 11, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 330.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.7937600612640381, \"percentile_inc_nulls\": 0.7937600612640381, \"value_count\": 10, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 390.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.7862386703491211, \"percentile_inc_nulls\": 0.7862386703491211, \"value_count\": 9, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 459.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.7773244380950928, \"percentile_inc_nulls\": 0.7773244380950928, \"value_count\": 8, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 544.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.767918586730957, \"percentile_inc_nulls\": 0.767918586730957, \"value_count\": 7, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, 
\"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 574.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.754645586013794, \"percentile_inc_nulls\": 0.754645586013794, \"value_count\": 6, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 810.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.7358830571174622, \"percentile_inc_nulls\": 0.7358830571174622, \"value_count\": 5, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1145.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.7013404369354248, \"percentile_inc_nulls\": 0.7013404369354248, \"value_count\": 4, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 2108.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.6384655833244324, \"percentile_inc_nulls\": 0.6384655833244324, \"value_count\": 3, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 3837.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.4765673875808716, \"percentile_inc_nulls\": 0.4765673875808716, \"value_count\": 2, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 9880.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 1, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 29083.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 798, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, 
\"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 798.0, \"distinct_value_count\": 36703}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"street_address\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 36703 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 798, \"group_name\": \"_street_address_\", \"value\": \"4500 park granada\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 552, \"group_name\": \"_street_address_\", \"value\": \"711 high st\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 351, \"group_name\": \"_street_address_\", \"value\": \"11 madison ave\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 347, \"group_name\": \"_street_address_\", \"value\": \"383 madison ave\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 303, \"group_name\": \"_street_address_\", \"value\": \"8400 normandale lk blvd\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 277, \"group_name\": \"_street_address_\", \"value\": \"1585 broadway\", 
\"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 266, \"group_name\": \"_street_address_\", \"value\": \"85 broad st\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 242, \"group_name\": \"_street_address_\", \"value\": \"7485 new horizon way\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 231, \"group_name\": \"_street_address_\", \"value\": \"co wilmington trust company\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 224, \"group_name\": \"_street_address_\", \"value\": \"4ld financial ctr floor 10\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"101 east kennedy blvd\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"7505 floyd ct\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"5972 ne 4th ave\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 
36703}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"har hotzvim 13 hartom st\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"3133 west frye rd\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 798]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}], \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\"}, {\"mode\": \"vega-lite\"});\n",
        "</script>"
       ],
       "text/plain": [
        "alt.VConcatChart(...)"
       ]
      },
-     "execution_count": 115,
+     "execution_count": 24,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -940,7 +577,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 116,
+   "execution_count": 25,
    "id": "d9cbc91b-b061-461b-b1f2-83036c70d7c7",
    "metadata": {},
    "outputs": [
@@ -949,23 +586,23 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-bcd8a0235c5a4a24b1b0057a8f800b3f.vega-embed {\n",
+       "  #altair-viz-0a5f2699cc2548b6aab8c96de757f7be.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-bcd8a0235c5a4a24b1b0057a8f800b3f.vega-embed details,\n",
-       "  #altair-viz-bcd8a0235c5a4a24b1b0057a8f800b3f.vega-embed details summary {\n",
+       "  #altair-viz-0a5f2699cc2548b6aab8c96de757f7be.vega-embed details,\n",
+       "  #altair-viz-0a5f2699cc2548b6aab8c96de757f7be.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-bcd8a0235c5a4a24b1b0057a8f800b3f\"></div>\n",
+       "<div id=\"altair-viz-0a5f2699cc2548b6aab8c96de757f7be\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-bcd8a0235c5a4a24b1b0057a8f800b3f\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-bcd8a0235c5a4a24b1b0057a8f800b3f\");\n",
+       "    if (outputDiv.id !== \"altair-viz-0a5f2699cc2548b6aab8c96de757f7be\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-0a5f2699cc2548b6aab8c96de757f7be\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -1011,14 +648,14 @@
        "        .catch(showError)\n",
        "        .then(() => displayChart(vegaEmbed));\n",
        "    }\n",
-       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"vconcat\": [{\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 1, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 9897.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9991835355758667, \"percentile_inc_nulls\": 0.9991835355758667, \"value_count\": 17, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 17.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9984630942344666, \"percentile_inc_nulls\": 0.9984630942344666, \"value_count\": 15, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 15.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9978387355804443, \"percentile_inc_nulls\": 0.9978387355804443, \"value_count\": 13, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 13.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9972623586654663, \"percentile_inc_nulls\": 0.9972623586654663, \"value_count\": 12, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 12.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.996734082698822, \"percentile_inc_nulls\": 0.996734082698822, \"value_count\": 11, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 11.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9952932000160217, 
\"percentile_inc_nulls\": 0.9952932000160217, \"value_count\": 10, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 30.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9944286942481995, \"percentile_inc_nulls\": 0.9944286942481995, \"value_count\": 9, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 18.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9917390942573547, \"percentile_inc_nulls\": 0.9917390942573547, \"value_count\": 8, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9883770942687988, \"percentile_inc_nulls\": 0.9883770942687988, \"value_count\": 7, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9765621423721313, \"percentile_inc_nulls\": 0.9765621423721313, \"value_count\": 6, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 246.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9439027905464172, \"percentile_inc_nulls\": 0.9439027905464172, \"value_count\": 5, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 680.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.8651361465454102, \"percentile_inc_nulls\": 0.8651361465454102, \"value_count\": 4, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1640.0, \"distinct_value_count\": 14086}, 
{\"percentile_ex_nulls\": 0.725949764251709, \"percentile_inc_nulls\": 0.725949764251709, \"value_count\": 3, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 2898.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.4753373861312866, \"percentile_inc_nulls\": 0.4753373861312866, \"value_count\": 2, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 5218.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 1, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 9897.0, \"distinct_value_count\": 14086}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"company_name\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 14086 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 17, \"group_name\": \"_company_name_\", \"value\": \"calpine corporation\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 15, \"group_name\": \"_company_name_\", \"value\": \"georgia pacific corporation\", \"total_non_null_rows\": 20821, 
\"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 13, \"group_name\": \"_company_name_\", \"value\": \"weyerhaeuser company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 12, \"group_name\": \"_company_name_\", \"value\": \"calpine eastern corporation\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 11, \"group_name\": \"_company_name_\", \"value\": \"calpine operating services company incorporated\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 10, \"group_name\": \"_company_name_\", \"value\": \"dow chemical company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 10, \"group_name\": \"_company_name_\", \"value\": \"international paper company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 10, \"group_name\": \"_company_name_\", \"value\": \"springfield city of\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 9, \"group_name\": \"_company_name_\", \"value\": \"newpage corporation\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 9, \"group_name\": \"_company_name_\", \"value\": \"smurfit stone container corporation\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": 
null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"10 briggs solar ng limited liability company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"3880 north mission road solar limited liability company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"41mb 8me limited liability company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"4c acquisition limited liability company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"59fed wham8 solar limited liability company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 17]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.0012776851654052734, 
\"percentile_inc_nulls\": 0.02391815185546875, \"value_count\": 24, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 24.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.0004914402961730957, \"percentile_inc_nulls\": 0.023149728775024414, \"value_count\": 16, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.0003439784049987793, \"percentile_inc_nulls\": 0.02300560474395752, \"value_count\": 3, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 3.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.00024569034576416016, \"percentile_inc_nulls\": 0.022909581661224365, \"value_count\": 2, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 2.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.022669434547424316, \"value_count\": 1, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 5.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.8766032457351685, \"percentile_inc_nulls\": 0.8794006109237671, \"value_count\": 2511, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 2511.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.7900633811950684, \"percentile_inc_nulls\": 0.7948225140571594, \"value_count\": 1761, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1761.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.7252445220947266, \"percentile_inc_nulls\": 
0.7314730286598206, \"value_count\": 1319, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1319.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.6654381155967712, \"percentile_inc_nulls\": 0.6730223894119263, \"value_count\": 1217, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1217.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.6124134063720703, \"percentile_inc_nulls\": 0.6211997270584106, \"value_count\": 1079, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1079.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.5711337327957153, \"percentile_inc_nulls\": 0.5808558464050293, \"value_count\": 840, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 840.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.5326551795005798, \"percentile_inc_nulls\": 0.5432496070861816, \"value_count\": 783, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 783.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.49756747484207153, \"percentile_inc_nulls\": 0.5089572668075562, \"value_count\": 714, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 714.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.4656739830970764, \"percentile_inc_nulls\": 0.4777868390083313, \"value_count\": 649, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 649.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.4348616600036621, \"percentile_inc_nulls\": 
0.4476730227470398, \"value_count\": 627, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 627.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.410388708114624, \"percentile_inc_nulls\": 0.42375487089157104, \"value_count\": 498, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 498.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.3862597942352295, \"percentile_inc_nulls\": 0.4001728892326355, \"value_count\": 491, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 491.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.3641948103904724, \"percentile_inc_nulls\": 0.3786081075668335, \"value_count\": 449, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 449.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.3430144190788269, \"percentile_inc_nulls\": 0.35790789127349854, \"value_count\": 431, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 431.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.3237505555152893, \"percentile_inc_nulls\": 0.3390807509422302, \"value_count\": 392, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 392.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.30483072996139526, \"percentile_inc_nulls\": 0.3205897808074951, \"value_count\": 385, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 385.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.2861074209213257, \"percentile_inc_nulls\": 
0.30229097604751587, \"value_count\": 381, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 381.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.2678264379501343, \"percentile_inc_nulls\": 0.284424364566803, \"value_count\": 372, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 372.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.2511671185493469, \"percentile_inc_nulls\": 0.2681427597999573, \"value_count\": 339, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 339.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.235687255859375, \"percentile_inc_nulls\": 0.2530137896537781, \"value_count\": 315, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 315.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.2217307686805725, \"percentile_inc_nulls\": 0.23937368392944336, \"value_count\": 284, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 284.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.20811831951141357, \"percentile_inc_nulls\": 0.22606980800628662, \"value_count\": 277, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 277.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.19470244646072388, \"percentile_inc_nulls\": 0.21295809745788574, \"value_count\": 273, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 273.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.1813356876373291, \"percentile_inc_nulls\": 
0.19989430904388428, \"value_count\": 272, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 272.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.15548676252365112, \"percentile_inc_nulls\": 0.17463135719299316, \"value_count\": 263, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 526.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.14349597692489624, \"percentile_inc_nulls\": 0.16291242837905884, \"value_count\": 244, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 244.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.13199663162231445, \"percentile_inc_nulls\": 0.1516737937927246, \"value_count\": 234, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.12148016691207886, \"percentile_inc_nulls\": 0.1413956880569458, \"value_count\": 214, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 214.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.11145508289337158, \"percentile_inc_nulls\": 0.1315978765487671, \"value_count\": 204, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 204.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.10162663459777832, \"percentile_inc_nulls\": 0.12199223041534424, \"value_count\": 200, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.09219127893447876, \"percentile_inc_nulls\": 
0.11277073621749878, \"value_count\": 192, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 192.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.08457416296005249, \"percentile_inc_nulls\": 0.10532635450363159, \"value_count\": 155, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 155.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.07730108499526978, \"percentile_inc_nulls\": 0.09821814298629761, \"value_count\": 148, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 148.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.07012629508972168, \"percentile_inc_nulls\": 0.09120601415634155, \"value_count\": 146, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 146.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.06373775005340576, \"percentile_inc_nulls\": 0.08496230840682983, \"value_count\": 130, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 130.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.05759495496749878, \"percentile_inc_nulls\": 0.07895874977111816, \"value_count\": 125, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 125.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.052091002464294434, \"percentile_inc_nulls\": 0.07357954978942871, \"value_count\": 112, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.0471767783164978, \"percentile_inc_nulls\": 
0.06877672672271729, \"value_count\": 100, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 100.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.042459070682525635, \"percentile_inc_nulls\": 0.06416600942611694, \"value_count\": 96, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 96.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.0380362868309021, \"percentile_inc_nulls\": 0.05984342098236084, \"value_count\": 90, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.033662617206573486, \"percentile_inc_nulls\": 0.0555688738822937, \"value_count\": 89, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 89.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.029436349868774414, \"percentile_inc_nulls\": 0.05143845081329346, \"value_count\": 86, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 86.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.02555406093597412, \"percentile_inc_nulls\": 0.047644197940826416, \"value_count\": 79, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 79.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.021720945835113525, \"percentile_inc_nulls\": 0.04389798641204834, \"value_count\": 78, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.018526732921600342, \"percentile_inc_nulls\": 
0.04077613353729248, \"value_count\": 65, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 65.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.01572561264038086, \"percentile_inc_nulls\": 0.03803849220275879, \"value_count\": 57, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 57.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.013071894645690918, \"percentile_inc_nulls\": 0.03544497489929199, \"value_count\": 54, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 54.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.010467350482940674, \"percentile_inc_nulls\": 0.03289949893951416, \"value_count\": 53, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 53.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.006142795085906982, \"percentile_inc_nulls\": 0.028672993183135986, \"value_count\": 44, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.004078805446624756, \"percentile_inc_nulls\": 0.02665579319000244, \"value_count\": 42, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.002457141876220703, \"percentile_inc_nulls\": 0.02507084608078003, \"value_count\": 33, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 24, 
\"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 24.0, \"distinct_value_count\": 62}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"state\\\"\", \"subtitle\": \"In this col, 472 values (2.3%) are null and there are 62 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 2511, \"group_name\": \"_state_\", \"value\": \"ca\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1761, \"group_name\": \"_state_\", \"value\": \"tx\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1319, \"group_name\": \"_state_\", \"value\": \"ny\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1217, \"group_name\": \"_state_\", \"value\": \"nc\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1079, \"group_name\": \"_state_\", \"value\": \"ma\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 840, \"group_name\": \"_state_\", \"value\": \"fl\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 783, 
\"group_name\": \"_state_\", \"value\": \"mn\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 714, \"group_name\": \"_state_\", \"value\": \"nj\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 649, \"group_name\": \"_state_\", \"value\": \"pa\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 627, \"group_name\": \"_state_\", \"value\": \"il\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"pr\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"uk\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"as\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"8a\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"gu\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, 
\"distinct_value_count\": 62}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 2511]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9553115367889404, \"percentile_inc_nulls\": 0.9554296135902405, \"value_count\": 928, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 928.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.9231917858123779, \"percentile_inc_nulls\": 0.9233946800231934, \"value_count\": 667, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 667.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.89848792552948, \"percentile_inc_nulls\": 0.8987560868263245, \"value_count\": 513, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 513.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.8761436939239502, \"percentile_inc_nulls\": 0.8764708638191223, \"value_count\": 464, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 464.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.8545699715614319, \"percentile_inc_nulls\": 0.8549541234970093, \"value_count\": 448, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, 
\"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 448.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.8359819054603577, \"percentile_inc_nulls\": 0.83641517162323, \"value_count\": 386, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 386.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.8250987529754639, \"percentile_inc_nulls\": 0.8255607485771179, \"value_count\": 226, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 226.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.815178632736206, \"percentile_inc_nulls\": 0.8156668543815613, \"value_count\": 206, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 206.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.8057401180267334, \"percentile_inc_nulls\": 0.8062533140182495, \"value_count\": 196, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 196.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7966387271881104, \"percentile_inc_nulls\": 0.797175943851471, \"value_count\": 189, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 189.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7878744006156921, \"percentile_inc_nulls\": 0.7884347438812256, \"value_count\": 182, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 182.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7795916199684143, \"percentile_inc_nulls\": 0.7801738977432251, \"value_count\": 172, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, 
\"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 172.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7714051604270935, \"percentile_inc_nulls\": 0.772009015083313, \"value_count\": 170, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 170.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7633631825447083, \"percentile_inc_nulls\": 0.7639882564544678, \"value_count\": 167, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 167.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7555619478225708, \"percentile_inc_nulls\": 0.7562077045440674, \"value_count\": 162, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 162.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7479052543640137, \"percentile_inc_nulls\": 0.7485711574554443, \"value_count\": 159, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 159.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7406337261199951, \"percentile_inc_nulls\": 0.7413188219070435, \"value_count\": 151, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 151.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7338438034057617, \"percentile_inc_nulls\": 0.7345468401908875, \"value_count\": 141, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 141.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7271501421928406, \"percentile_inc_nulls\": 0.7278709411621094, \"value_count\": 139, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, 
\"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 139.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7211307287216187, \"percentile_inc_nulls\": 0.7218673229217529, \"value_count\": 125, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 125.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.715207576751709, \"percentile_inc_nulls\": 0.7159598469734192, \"value_count\": 123, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 123.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7093807458877563, \"percentile_inc_nulls\": 0.7101483941078186, \"value_count\": 121, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 121.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7039391398429871, \"percentile_inc_nulls\": 0.704721212387085, \"value_count\": 113, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 113.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6933448910713196, \"percentile_inc_nulls\": 0.6941549777984619, \"value_count\": 110, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 220.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6880959272384644, \"percentile_inc_nulls\": 0.6889198422431946, \"value_count\": 109, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 109.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.677983283996582, \"percentile_inc_nulls\": 0.6788338422775269, \"value_count\": 105, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, 
\"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 210.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6729750633239746, \"percentile_inc_nulls\": 0.6738389134407043, \"value_count\": 104, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 104.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6682076454162598, \"percentile_inc_nulls\": 0.6690840721130371, \"value_count\": 99, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6634883880615234, \"percentile_inc_nulls\": 0.6643773317337036, \"value_count\": 98, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6589136123657227, \"percentile_inc_nulls\": 0.6598145961761475, \"value_count\": 95, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 95.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6501492857933044, \"percentile_inc_nulls\": 0.6510734558105469, \"value_count\": 91, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 182.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6459115743637085, \"percentile_inc_nulls\": 0.6468468904495239, \"value_count\": 88, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6420109868049622, \"percentile_inc_nulls\": 0.6429566144943237, \"value_count\": 81, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, 
\"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 81.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6383511424064636, \"percentile_inc_nulls\": 0.6393064260482788, \"value_count\": 76, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 76.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6313204169273376, \"percentile_inc_nulls\": 0.6322942972183228, \"value_count\": 73, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 146.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6243860125541687, \"percentile_inc_nulls\": 0.6253782510757446, \"value_count\": 72, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6212077140808105, \"percentile_inc_nulls\": 0.6222083568572998, \"value_count\": 66, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 66.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6180776357650757, \"percentile_inc_nulls\": 0.619086503982544, \"value_count\": 65, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 65.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6150919795036316, \"percentile_inc_nulls\": 0.616108775138855, \"value_count\": 62, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 62.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.612250804901123, \"percentile_inc_nulls\": 0.6132750511169434, \"value_count\": 59, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 
20821, \"sum_tokens_in_value_count_group\": 59.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6094577312469482, \"percentile_inc_nulls\": 0.6104893684387207, \"value_count\": 58, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6041606664657593, \"percentile_inc_nulls\": 0.6052062511444092, \"value_count\": 55, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5990561246871948, \"percentile_inc_nulls\": 0.6001152992248535, \"value_count\": 53, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 106.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5940479636192322, \"percentile_inc_nulls\": 0.5951203107833862, \"value_count\": 52, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 104.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.591592013835907, \"percentile_inc_nulls\": 0.5926708579063416, \"value_count\": 51, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 51.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5891842842102051, \"percentile_inc_nulls\": 0.5902694463729858, \"value_count\": 50, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 50.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5869209170341492, \"percentile_inc_nulls\": 0.5880120992660522, \"value_count\": 47, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, 
\"sum_tokens_in_value_count_group\": 47.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5847057700157166, \"percentile_inc_nulls\": 0.5858027935028076, \"value_count\": 46, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 46.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5825387239456177, \"percentile_inc_nulls\": 0.583641529083252, \"value_count\": 45, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 45.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5783973932266235, \"percentile_inc_nulls\": 0.5795110464096069, \"value_count\": 43, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 86.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5743522644042969, \"percentile_inc_nulls\": 0.5754766464233398, \"value_count\": 42, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 84.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.568429172039032, \"percentile_inc_nulls\": 0.5695691704750061, \"value_count\": 41, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 123.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5626504421234131, \"percentile_inc_nulls\": 0.5638057589530945, \"value_count\": 40, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5607724189758301, \"percentile_inc_nulls\": 0.5619326829910278, \"value_count\": 39, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, 
\"sum_tokens_in_value_count_group\": 39.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5536453723907471, \"percentile_inc_nulls\": 0.5548244714736938, \"value_count\": 37, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 148.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5519117712974548, \"percentile_inc_nulls\": 0.5530954599380493, \"value_count\": 36, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 36.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5485408902168274, \"percentile_inc_nulls\": 0.5497334003448486, \"value_count\": 35, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.537416934967041, \"percentile_inc_nulls\": 0.5386388301849365, \"value_count\": 33, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 231.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5327939987182617, \"percentile_inc_nulls\": 0.534028172492981, \"value_count\": 32, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 96.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5253298282623291, \"percentile_inc_nulls\": 0.526583731174469, \"value_count\": 31, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 155.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.523885190486908, \"percentile_inc_nulls\": 0.5251429080963135, \"value_count\": 30, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, 
\"sum_tokens_in_value_count_group\": 30.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5210921764373779, \"percentile_inc_nulls\": 0.5223572254180908, \"value_count\": 29, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5156987309455872, \"percentile_inc_nulls\": 0.5169780254364014, \"value_count\": 28, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5104979276657104, \"percentile_inc_nulls\": 0.5117909908294678, \"value_count\": 27, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5067417621612549, \"percentile_inc_nulls\": 0.5080447196960449, \"value_count\": 26, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.4995184540748596, \"percentile_inc_nulls\": 0.5008404850959778, \"value_count\": 25, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 150.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.49720698595046997, \"percentile_inc_nulls\": 0.49853515625, \"value_count\": 24, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 48.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.49056148529052734, \"percentile_inc_nulls\": 0.49190717935562134, \"value_count\": 23, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, 
\"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.4852643609046936, \"percentile_inc_nulls\": 0.4866240620613098, \"value_count\": 22, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.47616297006607056, \"percentile_inc_nulls\": 0.47754669189453125, \"value_count\": 21, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 189.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.46845805644989014, \"percentile_inc_nulls\": 0.469862163066864, \"value_count\": 20, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.46388328075408936, \"percentile_inc_nulls\": 0.46529942750930786, \"value_count\": 19, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 95.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.4595492482185364, \"percentile_inc_nulls\": 0.46097689867019653, \"value_count\": 18, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.4489068388938904, \"percentile_inc_nulls\": 0.4503626227378845, \"value_count\": 17, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 221.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.44120198488235474, \"percentile_inc_nulls\": 0.4426780939102173, \"value_count\": 16, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, 
\"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.43325626850128174, \"percentile_inc_nulls\": 0.4347533583641052, \"value_count\": 15, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 165.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.42044687271118164, \"percentile_inc_nulls\": 0.42197781801223755, \"value_count\": 14, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 266.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.4135606288909912, \"percentile_inc_nulls\": 0.4151097536087036, \"value_count\": 13, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 143.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.4025811553001404, \"percentile_inc_nulls\": 0.4041592478752136, \"value_count\": 12, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 228.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.3919869065284729, \"percentile_inc_nulls\": 0.3935930132865906, \"value_count\": 11, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 220.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.37465089559555054, \"percentile_inc_nulls\": 0.3763027787208557, \"value_count\": 10, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 360.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.35991525650024414, \"percentile_inc_nulls\": 0.36160606145858765, \"value_count\": 9, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, 
\"sum_tokens_in_value_count_group\": 306.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.33757102489471436, \"percentile_inc_nulls\": 0.3393208980560303, \"value_count\": 8, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 464.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.31869399547576904, \"percentile_inc_nulls\": 0.32049375772476196, \"value_count\": 7, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 392.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.2932678461074829, \"percentile_inc_nulls\": 0.2951347231864929, \"value_count\": 6, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 528.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.25739187002182007, \"percentile_inc_nulls\": 0.25935351848602295, \"value_count\": 5, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 745.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.2086583971977234, \"percentile_inc_nulls\": 0.2107487916946411, \"value_count\": 4, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1012.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.15621691942214966, \"percentile_inc_nulls\": 0.15844577550888062, \"value_count\": 3, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1089.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.09313303232192993, \"percentile_inc_nulls\": 0.09552854299545288, \"value_count\": 2, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, 
\"sum_tokens_in_value_count_group\": 1310.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0026415586471557617, \"value_count\": 1, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1934.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 928, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 928.0, \"distinct_value_count\": 3879}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"city\\\"\", \"subtitle\": \"In this col, 55 values (0.3%) are null and there are 3879 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 928, \"group_name\": \"_city_\", \"value\": \"houston\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 667, \"group_name\": \"_city_\", \"value\": \"new york\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 513, \"group_name\": \"_city_\", \"value\": \"san francisco\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 464, \"group_name\": \"_city_\", \"value\": \"juno 
beach\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 448, \"group_name\": \"_city_\", \"value\": \"boston\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 386, \"group_name\": \"_city_\", \"value\": \"charlotte\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 226, \"group_name\": \"_city_\", \"value\": \"chicago\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 206, \"group_name\": \"_city_\", \"value\": \"san diego\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 196, \"group_name\": \"_city_\", \"value\": \"andover\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 189, \"group_name\": \"_city_\", \"value\": \"nashville\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"ft. 
washington\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"yadkinville\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"east longmeadow\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"lebo\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"londonderry\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 928]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.7171125411987305, \"percentile_inc_nulls\": 0.7171125411987305, \"value_count\": 8, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 376.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.6966043710708618, \"percentile_inc_nulls\": 0.6966043710708618, \"value_count\": 7, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 
427.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.6680755019187927, \"percentile_inc_nulls\": 0.6680755019187927, \"value_count\": 6, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 594.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.6394985914230347, \"percentile_inc_nulls\": 0.6394985914230347, \"value_count\": 5, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 595.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.6006916165351868, \"percentile_inc_nulls\": 0.6006916165351868, \"value_count\": 4, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 808.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.548388659954071, \"percentile_inc_nulls\": 0.548388659954071, \"value_count\": 3, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1089.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.43811535835266113, \"percentile_inc_nulls\": 0.43811535835266113, \"value_count\": 2, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 2296.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 1, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 9122.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9795879125595093, \"percentile_inc_nulls\": 0.9795879125595093, \"value_count\": 425, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, 
\"sum_tokens_in_value_count_group\": 425.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9712309837341309, \"percentile_inc_nulls\": 0.9712309837341309, \"value_count\": 174, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 174.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.964026689529419, \"percentile_inc_nulls\": 0.964026689529419, \"value_count\": 150, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 150.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9597522020339966, \"percentile_inc_nulls\": 0.9597522020339966, \"value_count\": 89, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 89.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9559579491615295, \"percentile_inc_nulls\": 0.9559579491615295, \"value_count\": 79, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 79.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9490418434143066, \"percentile_inc_nulls\": 0.9490418434143066, \"value_count\": 72, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.942702054977417, \"percentile_inc_nulls\": 0.942702054977417, \"value_count\": 66, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 132.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9365544319152832, \"percentile_inc_nulls\": 0.9365544319152832, \"value_count\": 64, \"group_name\": \"_street_address_\", 
\"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 128.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9335286617279053, \"percentile_inc_nulls\": 0.9335286617279053, \"value_count\": 63, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 63.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9305508732795715, \"percentile_inc_nulls\": 0.9305508732795715, \"value_count\": 62, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 62.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9277172088623047, \"percentile_inc_nulls\": 0.9277172088623047, \"value_count\": 59, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 59.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9251717329025269, \"percentile_inc_nulls\": 0.9251717329025269, \"value_count\": 53, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 53.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9202728271484375, \"percentile_inc_nulls\": 0.9202728271484375, \"value_count\": 51, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9179674386978149, \"percentile_inc_nulls\": 0.9179674386978149, \"value_count\": 48, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 48.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9111953973770142, \"percentile_inc_nulls\": 0.9111953973770142, 
\"value_count\": 47, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 141.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9089860916137695, \"percentile_inc_nulls\": 0.9089860916137695, \"value_count\": 46, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 46.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9068248271942139, \"percentile_inc_nulls\": 0.9068248271942139, \"value_count\": 45, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 45.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9047116041183472, \"percentile_inc_nulls\": 0.9047116041183472, \"value_count\": 44, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 44.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9027424454689026, \"percentile_inc_nulls\": 0.9027424454689026, \"value_count\": 41, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 41.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8989001512527466, \"percentile_inc_nulls\": 0.8989001512527466, \"value_count\": 40, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 80.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8915998339653015, \"percentile_inc_nulls\": 0.8915998339653015, \"value_count\": 38, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 152.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 
0.886268675327301, \"percentile_inc_nulls\": 0.886268675327301, \"value_count\": 37, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 111.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8829066753387451, \"percentile_inc_nulls\": 0.8829066753387451, \"value_count\": 35, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8813217282295227, \"percentile_inc_nulls\": 0.8813217282295227, \"value_count\": 33, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8738773465156555, \"percentile_inc_nulls\": 0.8738773465156555, \"value_count\": 31, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 155.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8709956407546997, \"percentile_inc_nulls\": 0.8709956407546997, \"value_count\": 30, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 60.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8696027994155884, \"percentile_inc_nulls\": 0.8696027994155884, \"value_count\": 29, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 29.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8628787994384766, \"percentile_inc_nulls\": 0.8628787994384766, \"value_count\": 28, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 140.0, 
\"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8563950061798096, \"percentile_inc_nulls\": 0.8563950061798096, \"value_count\": 27, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 135.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8526487350463867, \"percentile_inc_nulls\": 0.8526487350463867, \"value_count\": 26, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8490466475486755, \"percentile_inc_nulls\": 0.8490466475486755, \"value_count\": 25, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 75.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8455885648727417, \"percentile_inc_nulls\": 0.8455885648727417, \"value_count\": 24, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 72.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8411699533462524, \"percentile_inc_nulls\": 0.8411699533462524, \"value_count\": 23, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 92.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8401133418083191, \"percentile_inc_nulls\": 0.8401133418083191, \"value_count\": 22, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 22.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8380961418151855, \"percentile_inc_nulls\": 0.8380961418151855, \"value_count\": 21, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, 
\"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8332933187484741, \"percentile_inc_nulls\": 0.8332933187484741, \"value_count\": 20, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 100.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.828730583190918, \"percentile_inc_nulls\": 0.828730583190918, \"value_count\": 19, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 95.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8218145370483398, \"percentile_inc_nulls\": 0.8218145370483398, \"value_count\": 18, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8136496543884277, \"percentile_inc_nulls\": 0.8136496543884277, \"value_count\": 17, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 170.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8082705140113831, \"percentile_inc_nulls\": 0.8082705140113831, \"value_count\": 16, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.800345778465271, \"percentile_inc_nulls\": 0.800345778465271, \"value_count\": 15, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 165.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.7956390380859375, \"percentile_inc_nulls\": 0.7956390380859375, \"value_count\": 14, \"group_name\": 
\"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.7844003438949585, \"percentile_inc_nulls\": 0.7844003438949585, \"value_count\": 13, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.7728735208511353, \"percentile_inc_nulls\": 0.7728735208511353, \"value_count\": 12, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 240.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.7633638978004456, \"percentile_inc_nulls\": 0.7633638978004456, \"value_count\": 11, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 198.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.7494356632232666, \"percentile_inc_nulls\": 0.7494356632232666, \"value_count\": 10, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 290.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.7351711988449097, \"percentile_inc_nulls\": 0.7351711988449097, \"value_count\": 9, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 297.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 8, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 376.0, \"distinct_value_count\": 11403}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": 
\"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"street_address\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 11403 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 425, \"group_name\": \"_street_address_\", \"value\": \"700 universe blvd\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 174, \"group_name\": \"_street_address_\", \"value\": \"130 roberts st\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 150, \"group_name\": \"_street_address_\", \"value\": \"800 taylor st suite 200\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 89, \"group_name\": \"_street_address_\", \"value\": \"333 washington st\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 79, \"group_name\": \"_street_address_\", \"value\": \"1519 king st\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 72, \"group_name\": \"_street_address_\", \"value\": \"575 fifth ave 35th fl\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 72, \"group_name\": \"_street_address_\", \"value\": \"222 2nd ave south suite 1900\", 
\"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 66, \"group_name\": \"_street_address_\", \"value\": \"50101 governors dr\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 66, \"group_name\": \"_street_address_\", \"value\": \"101 summer st 2nd floor\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 64, \"group_name\": \"_street_address_\", \"value\": \"66 york st 5th floor\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"300 spectrum ctr dr ste1020\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"1310 mackie rd\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"26w271 durfree st\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"180 hbr dr\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, 
{\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"964 lebanon church rd\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 425]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}], \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\"}, {\"mode\": \"vega-lite\"});\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"vconcat\": [{\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9991835355758667, \"percentile_inc_nulls\": 0.9991835355758667, \"value_count\": 17, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 17.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9984630942344666, \"percentile_inc_nulls\": 0.9984630942344666, \"value_count\": 15, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 15.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9978387355804443, \"percentile_inc_nulls\": 0.9978387355804443, \"value_count\": 13, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 13.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9972623586654663, \"percentile_inc_nulls\": 0.9972623586654663, \"value_count\": 12, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 12.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.996734082698822, \"percentile_inc_nulls\": 0.996734082698822, \"value_count\": 11, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 11.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9952932000160217, \"percentile_inc_nulls\": 0.9952932000160217, \"value_count\": 10, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 30.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 
0.9944286942481995, \"percentile_inc_nulls\": 0.9944286942481995, \"value_count\": 9, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 18.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9917390942573547, \"percentile_inc_nulls\": 0.9917390942573547, \"value_count\": 8, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9883770942687988, \"percentile_inc_nulls\": 0.9883770942687988, \"value_count\": 7, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9765621423721313, \"percentile_inc_nulls\": 0.9765621423721313, \"value_count\": 6, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 246.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9439027905464172, \"percentile_inc_nulls\": 0.9439027905464172, \"value_count\": 5, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 680.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.8651361465454102, \"percentile_inc_nulls\": 0.8651361465454102, \"value_count\": 4, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1640.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.725949764251709, \"percentile_inc_nulls\": 0.725949764251709, \"value_count\": 3, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 2898.0, \"distinct_value_count\": 
14086}, {\"percentile_ex_nulls\": 0.4753373861312866, \"percentile_inc_nulls\": 0.4753373861312866, \"value_count\": 2, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 5218.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 1, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 9897.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 17, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 17.0, \"distinct_value_count\": 14086}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"company_name\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 14086 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 17, \"group_name\": \"_company_name_\", \"value\": \"calpine corporation\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 15, \"group_name\": \"_company_name_\", \"value\": \"georgia pacific corporation\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, 
\"distinct_value_count\": 14086}, {\"value_count\": 13, \"group_name\": \"_company_name_\", \"value\": \"weyerhaeuser company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 12, \"group_name\": \"_company_name_\", \"value\": \"calpine eastern corporation\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 11, \"group_name\": \"_company_name_\", \"value\": \"calpine operating services company incorporated\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 10, \"group_name\": \"_company_name_\", \"value\": \"international paper company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 10, \"group_name\": \"_company_name_\", \"value\": \"dow chemical company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 10, \"group_name\": \"_company_name_\", \"value\": \"springfield city of\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 9, \"group_name\": \"_company_name_\", \"value\": \"smurfit stone container corporation\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 9, \"group_name\": \"_company_name_\", \"value\": \"newpage corporation\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, 
\"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"0ham wham8 solar limited liability company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"louisiana energy and power authority\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"lost hills solar limited liability company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"long plain solar limited liability company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"los angeles county\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 17]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.8766032457351685, \"percentile_inc_nulls\": 0.8794006109237671, \"value_count\": 2511, 
\"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 2511.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.7900633811950684, \"percentile_inc_nulls\": 0.7948225140571594, \"value_count\": 1761, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1761.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.7252445220947266, \"percentile_inc_nulls\": 0.7314730286598206, \"value_count\": 1319, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1319.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.6654381155967712, \"percentile_inc_nulls\": 0.6730223894119263, \"value_count\": 1217, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1217.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.6124134063720703, \"percentile_inc_nulls\": 0.6211997270584106, \"value_count\": 1079, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1079.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.5711337327957153, \"percentile_inc_nulls\": 0.5808558464050293, \"value_count\": 840, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 840.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.5326551795005798, \"percentile_inc_nulls\": 0.5432496070861816, \"value_count\": 783, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 783.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.49756747484207153, \"percentile_inc_nulls\": 0.5089572668075562, \"value_count\": 714, 
\"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 714.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.4656739830970764, \"percentile_inc_nulls\": 0.4777868390083313, \"value_count\": 649, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 649.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.4348616600036621, \"percentile_inc_nulls\": 0.4476730227470398, \"value_count\": 627, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 627.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.410388708114624, \"percentile_inc_nulls\": 0.42375487089157104, \"value_count\": 498, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 498.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.3862597942352295, \"percentile_inc_nulls\": 0.4001728892326355, \"value_count\": 491, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 491.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.3641948103904724, \"percentile_inc_nulls\": 0.3786081075668335, \"value_count\": 449, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 449.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.3430144190788269, \"percentile_inc_nulls\": 0.35790789127349854, \"value_count\": 431, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 431.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.3237505555152893, \"percentile_inc_nulls\": 0.3390807509422302, \"value_count\": 392, \"group_name\": 
\"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 392.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.30483072996139526, \"percentile_inc_nulls\": 0.3205897808074951, \"value_count\": 385, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 385.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.2861074209213257, \"percentile_inc_nulls\": 0.30229097604751587, \"value_count\": 381, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 381.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.2678264379501343, \"percentile_inc_nulls\": 0.284424364566803, \"value_count\": 372, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 372.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.2511671185493469, \"percentile_inc_nulls\": 0.2681427597999573, \"value_count\": 339, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 339.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.235687255859375, \"percentile_inc_nulls\": 0.2530137896537781, \"value_count\": 315, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 315.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.2217307686805725, \"percentile_inc_nulls\": 0.23937368392944336, \"value_count\": 284, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 284.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.20811831951141357, \"percentile_inc_nulls\": 0.22606980800628662, \"value_count\": 277, \"group_name\": \"_state_\", 
\"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 277.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.19470244646072388, \"percentile_inc_nulls\": 0.21295809745788574, \"value_count\": 273, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 273.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.1813356876373291, \"percentile_inc_nulls\": 0.19989430904388428, \"value_count\": 272, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 272.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.15548676252365112, \"percentile_inc_nulls\": 0.17463135719299316, \"value_count\": 263, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 526.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.14349597692489624, \"percentile_inc_nulls\": 0.16291242837905884, \"value_count\": 244, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 244.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.13199663162231445, \"percentile_inc_nulls\": 0.1516737937927246, \"value_count\": 234, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.12148016691207886, \"percentile_inc_nulls\": 0.1413956880569458, \"value_count\": 214, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 214.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.11145508289337158, \"percentile_inc_nulls\": 0.1315978765487671, \"value_count\": 204, \"group_name\": \"_state_\", 
\"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 204.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.10162663459777832, \"percentile_inc_nulls\": 0.12199223041534424, \"value_count\": 200, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.09219127893447876, \"percentile_inc_nulls\": 0.11277073621749878, \"value_count\": 192, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 192.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.08457416296005249, \"percentile_inc_nulls\": 0.10532635450363159, \"value_count\": 155, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 155.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.07730108499526978, \"percentile_inc_nulls\": 0.09821814298629761, \"value_count\": 148, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 148.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.07012629508972168, \"percentile_inc_nulls\": 0.09120601415634155, \"value_count\": 146, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 146.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.06373775005340576, \"percentile_inc_nulls\": 0.08496230840682983, \"value_count\": 130, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 130.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.05759495496749878, \"percentile_inc_nulls\": 0.07895874977111816, \"value_count\": 125, \"group_name\": \"_state_\", 
\"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 125.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.052091002464294434, \"percentile_inc_nulls\": 0.07357954978942871, \"value_count\": 112, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.0471767783164978, \"percentile_inc_nulls\": 0.06877672672271729, \"value_count\": 100, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 100.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.042459070682525635, \"percentile_inc_nulls\": 0.06416600942611694, \"value_count\": 96, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 96.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.0380362868309021, \"percentile_inc_nulls\": 0.05984342098236084, \"value_count\": 90, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.033662617206573486, \"percentile_inc_nulls\": 0.0555688738822937, \"value_count\": 89, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 89.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.029436349868774414, \"percentile_inc_nulls\": 0.05143845081329346, \"value_count\": 86, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 86.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.02555406093597412, \"percentile_inc_nulls\": 0.047644197940826416, \"value_count\": 79, \"group_name\": \"_state_\", 
\"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 79.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.021720945835113525, \"percentile_inc_nulls\": 0.04389798641204834, \"value_count\": 78, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.018526732921600342, \"percentile_inc_nulls\": 0.04077613353729248, \"value_count\": 65, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 65.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.01572561264038086, \"percentile_inc_nulls\": 0.03803849220275879, \"value_count\": 57, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 57.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.013071894645690918, \"percentile_inc_nulls\": 0.03544497489929199, \"value_count\": 54, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 54.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.010467350482940674, \"percentile_inc_nulls\": 0.03289949893951416, \"value_count\": 53, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 53.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.006142795085906982, \"percentile_inc_nulls\": 0.028672993183135986, \"value_count\": 44, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.004078805446624756, \"percentile_inc_nulls\": 0.02665579319000244, \"value_count\": 42, \"group_name\": \"_state_\", 
\"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.002457141876220703, \"percentile_inc_nulls\": 0.02507084608078003, \"value_count\": 33, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.0012776851654052734, \"percentile_inc_nulls\": 0.02391815185546875, \"value_count\": 24, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 24.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.0004914402961730957, \"percentile_inc_nulls\": 0.023149728775024414, \"value_count\": 16, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.0003439784049987793, \"percentile_inc_nulls\": 0.02300560474395752, \"value_count\": 3, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 3.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.00024569034576416016, \"percentile_inc_nulls\": 0.022909581661224365, \"value_count\": 2, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 2.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.022669434547424316, \"value_count\": 1, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 5.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 2511, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 
20821, \"sum_tokens_in_value_count_group\": 2511.0, \"distinct_value_count\": 62}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"state\\\"\", \"subtitle\": \"In this col, 472 values (2.3%) are null and there are 62 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 2511, \"group_name\": \"_state_\", \"value\": \"ca\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1761, \"group_name\": \"_state_\", \"value\": \"tx\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1319, \"group_name\": \"_state_\", \"value\": \"ny\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1217, \"group_name\": \"_state_\", \"value\": \"nc\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1079, \"group_name\": \"_state_\", \"value\": \"ma\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 840, \"group_name\": \"_state_\", \"value\": \"fl\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 783, \"group_name\": \"_state_\", \"value\": \"mn\", \"total_non_null_rows\": 20349, 
\"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 714, \"group_name\": \"_state_\", \"value\": \"nj\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 649, \"group_name\": \"_state_\", \"value\": \"pa\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 627, \"group_name\": \"_state_\", \"value\": \"il\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"gu\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"pr\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"8a\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"as\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"uk\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": 
\"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 2511]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9553115367889404, \"percentile_inc_nulls\": 0.9554296135902405, \"value_count\": 928, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 928.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.9231917858123779, \"percentile_inc_nulls\": 0.9233946800231934, \"value_count\": 667, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 667.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.89848792552948, \"percentile_inc_nulls\": 0.8987560868263245, \"value_count\": 513, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 513.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.8761436939239502, \"percentile_inc_nulls\": 0.8764708638191223, \"value_count\": 464, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 464.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.8545699715614319, \"percentile_inc_nulls\": 0.8549541234970093, \"value_count\": 448, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 448.0, \"distinct_value_count\": 3879}, 
{\"percentile_ex_nulls\": 0.8359819054603577, \"percentile_inc_nulls\": 0.83641517162323, \"value_count\": 386, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 386.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.8250987529754639, \"percentile_inc_nulls\": 0.8255607485771179, \"value_count\": 226, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 226.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.815178632736206, \"percentile_inc_nulls\": 0.8156668543815613, \"value_count\": 206, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 206.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.8057401180267334, \"percentile_inc_nulls\": 0.8062533140182495, \"value_count\": 196, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 196.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7966387271881104, \"percentile_inc_nulls\": 0.797175943851471, \"value_count\": 189, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 189.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7878744006156921, \"percentile_inc_nulls\": 0.7884347438812256, \"value_count\": 182, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 182.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7795916199684143, \"percentile_inc_nulls\": 0.7801738977432251, \"value_count\": 172, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 172.0, \"distinct_value_count\": 3879}, 
{\"percentile_ex_nulls\": 0.7714051604270935, \"percentile_inc_nulls\": 0.772009015083313, \"value_count\": 170, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 170.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7633631825447083, \"percentile_inc_nulls\": 0.7639882564544678, \"value_count\": 167, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 167.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7555619478225708, \"percentile_inc_nulls\": 0.7562077045440674, \"value_count\": 162, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 162.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7479052543640137, \"percentile_inc_nulls\": 0.7485711574554443, \"value_count\": 159, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 159.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7406337261199951, \"percentile_inc_nulls\": 0.7413188219070435, \"value_count\": 151, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 151.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7338438034057617, \"percentile_inc_nulls\": 0.7345468401908875, \"value_count\": 141, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 141.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7271501421928406, \"percentile_inc_nulls\": 0.7278709411621094, \"value_count\": 139, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 139.0, \"distinct_value_count\": 3879}, 
{\"percentile_ex_nulls\": 0.7211307287216187, \"percentile_inc_nulls\": 0.7218673229217529, \"value_count\": 125, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 125.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.715207576751709, \"percentile_inc_nulls\": 0.7159598469734192, \"value_count\": 123, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 123.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7093807458877563, \"percentile_inc_nulls\": 0.7101483941078186, \"value_count\": 121, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 121.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7039391398429871, \"percentile_inc_nulls\": 0.704721212387085, \"value_count\": 113, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 113.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6933448910713196, \"percentile_inc_nulls\": 0.6941549777984619, \"value_count\": 110, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 220.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6880959272384644, \"percentile_inc_nulls\": 0.6889198422431946, \"value_count\": 109, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 109.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.677983283996582, \"percentile_inc_nulls\": 0.6788338422775269, \"value_count\": 105, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 210.0, \"distinct_value_count\": 3879}, 
{\"percentile_ex_nulls\": 0.6729750633239746, \"percentile_inc_nulls\": 0.6738389134407043, \"value_count\": 104, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 104.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6682076454162598, \"percentile_inc_nulls\": 0.6690840721130371, \"value_count\": 99, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6634883880615234, \"percentile_inc_nulls\": 0.6643773317337036, \"value_count\": 98, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6589136123657227, \"percentile_inc_nulls\": 0.6598145961761475, \"value_count\": 95, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 95.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6501492857933044, \"percentile_inc_nulls\": 0.6510734558105469, \"value_count\": 91, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 182.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6459115743637085, \"percentile_inc_nulls\": 0.6468468904495239, \"value_count\": 88, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6420109868049622, \"percentile_inc_nulls\": 0.6429566144943237, \"value_count\": 81, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 81.0, \"distinct_value_count\": 3879}, 
{\"percentile_ex_nulls\": 0.6383511424064636, \"percentile_inc_nulls\": 0.6393064260482788, \"value_count\": 76, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 76.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6313204169273376, \"percentile_inc_nulls\": 0.6322942972183228, \"value_count\": 73, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 146.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6243860125541687, \"percentile_inc_nulls\": 0.6253782510757446, \"value_count\": 72, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6212077140808105, \"percentile_inc_nulls\": 0.6222083568572998, \"value_count\": 66, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 66.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6180776357650757, \"percentile_inc_nulls\": 0.619086503982544, \"value_count\": 65, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 65.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6150919795036316, \"percentile_inc_nulls\": 0.616108775138855, \"value_count\": 62, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 62.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.612250804901123, \"percentile_inc_nulls\": 0.6132750511169434, \"value_count\": 59, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 59.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 
0.6094577312469482, \"percentile_inc_nulls\": 0.6104893684387207, \"value_count\": 58, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6041606664657593, \"percentile_inc_nulls\": 0.6052062511444092, \"value_count\": 55, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5990561246871948, \"percentile_inc_nulls\": 0.6001152992248535, \"value_count\": 53, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 106.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5940479636192322, \"percentile_inc_nulls\": 0.5951203107833862, \"value_count\": 52, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 104.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.591592013835907, \"percentile_inc_nulls\": 0.5926708579063416, \"value_count\": 51, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 51.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5891842842102051, \"percentile_inc_nulls\": 0.5902694463729858, \"value_count\": 50, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 50.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5869209170341492, \"percentile_inc_nulls\": 0.5880120992660522, \"value_count\": 47, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 47.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5847057700157166, 
\"percentile_inc_nulls\": 0.5858027935028076, \"value_count\": 46, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 46.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5825387239456177, \"percentile_inc_nulls\": 0.583641529083252, \"value_count\": 45, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 45.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5783973932266235, \"percentile_inc_nulls\": 0.5795110464096069, \"value_count\": 43, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 86.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5743522644042969, \"percentile_inc_nulls\": 0.5754766464233398, \"value_count\": 42, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 84.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.568429172039032, \"percentile_inc_nulls\": 0.5695691704750061, \"value_count\": 41, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 123.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5626504421234131, \"percentile_inc_nulls\": 0.5638057589530945, \"value_count\": 40, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5607724189758301, \"percentile_inc_nulls\": 0.5619326829910278, \"value_count\": 39, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 39.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5536453723907471, \"percentile_inc_nulls\": 
0.5548244714736938, \"value_count\": 37, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 148.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5519117712974548, \"percentile_inc_nulls\": 0.5530954599380493, \"value_count\": 36, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 36.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5485408902168274, \"percentile_inc_nulls\": 0.5497334003448486, \"value_count\": 35, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.537416934967041, \"percentile_inc_nulls\": 0.5386388301849365, \"value_count\": 33, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 231.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5327939987182617, \"percentile_inc_nulls\": 0.534028172492981, \"value_count\": 32, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 96.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5253298282623291, \"percentile_inc_nulls\": 0.526583731174469, \"value_count\": 31, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 155.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.523885190486908, \"percentile_inc_nulls\": 0.5251429080963135, \"value_count\": 30, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 30.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5210921764373779, \"percentile_inc_nulls\": 0.5223572254180908, 
\"value_count\": 29, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5156987309455872, \"percentile_inc_nulls\": 0.5169780254364014, \"value_count\": 28, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5104979276657104, \"percentile_inc_nulls\": 0.5117909908294678, \"value_count\": 27, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5067417621612549, \"percentile_inc_nulls\": 0.5080447196960449, \"value_count\": 26, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.4995184540748596, \"percentile_inc_nulls\": 0.5008404850959778, \"value_count\": 25, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 150.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.49720698595046997, \"percentile_inc_nulls\": 0.49853515625, \"value_count\": 24, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 48.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.49056148529052734, \"percentile_inc_nulls\": 0.49190717935562134, \"value_count\": 23, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.4852643609046936, \"percentile_inc_nulls\": 0.4866240620613098, \"value_count\": 22, 
\"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.47616297006607056, \"percentile_inc_nulls\": 0.47754669189453125, \"value_count\": 21, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 189.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.46845805644989014, \"percentile_inc_nulls\": 0.469862163066864, \"value_count\": 20, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.46388328075408936, \"percentile_inc_nulls\": 0.46529942750930786, \"value_count\": 19, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 95.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.4595492482185364, \"percentile_inc_nulls\": 0.46097689867019653, \"value_count\": 18, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.4489068388938904, \"percentile_inc_nulls\": 0.4503626227378845, \"value_count\": 17, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 221.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.44120198488235474, \"percentile_inc_nulls\": 0.4426780939102173, \"value_count\": 16, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.43325626850128174, \"percentile_inc_nulls\": 0.4347533583641052, \"value_count\": 15, 
\"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 165.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.42044687271118164, \"percentile_inc_nulls\": 0.42197781801223755, \"value_count\": 14, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 266.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.4135606288909912, \"percentile_inc_nulls\": 0.4151097536087036, \"value_count\": 13, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 143.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.4025811553001404, \"percentile_inc_nulls\": 0.4041592478752136, \"value_count\": 12, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 228.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.3919869065284729, \"percentile_inc_nulls\": 0.3935930132865906, \"value_count\": 11, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 220.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.37465089559555054, \"percentile_inc_nulls\": 0.3763027787208557, \"value_count\": 10, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 360.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.35991525650024414, \"percentile_inc_nulls\": 0.36160606145858765, \"value_count\": 9, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 306.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.33757102489471436, \"percentile_inc_nulls\": 0.3393208980560303, \"value_count\": 8, 
\"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 464.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.31869399547576904, \"percentile_inc_nulls\": 0.32049375772476196, \"value_count\": 7, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 392.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.2932678461074829, \"percentile_inc_nulls\": 0.2951347231864929, \"value_count\": 6, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 528.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.25739187002182007, \"percentile_inc_nulls\": 0.25935351848602295, \"value_count\": 5, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 745.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.2086583971977234, \"percentile_inc_nulls\": 0.2107487916946411, \"value_count\": 4, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1012.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.15621691942214966, \"percentile_inc_nulls\": 0.15844577550888062, \"value_count\": 3, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1089.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.09313303232192993, \"percentile_inc_nulls\": 0.09552854299545288, \"value_count\": 2, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1310.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0026415586471557617, \"value_count\": 1, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1934.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 928, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 928.0, \"distinct_value_count\": 3879}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"city\\\"\", \"subtitle\": \"In this col, 55 values (0.3%) are null and there are 3879 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 928, \"group_name\": \"_city_\", \"value\": \"houston\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 667, \"group_name\": \"_city_\", \"value\": \"new york\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 513, \"group_name\": \"_city_\", \"value\": \"san francisco\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 464, \"group_name\": \"_city_\", \"value\": \"juno beach\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 448, \"group_name\": \"_city_\", \"value\": \"boston\", 
\"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 386, \"group_name\": \"_city_\", \"value\": \"charlotte\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 226, \"group_name\": \"_city_\", \"value\": \"chicago\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 206, \"group_name\": \"_city_\", \"value\": \"san diego\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 196, \"group_name\": \"_city_\", \"value\": \"andover\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 189, \"group_name\": \"_city_\", \"value\": \"nashville\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"loma linda\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"combined locks\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"gatlinburg\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 
20821, \"distinct_value_count\": 3879}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"lanai city\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"kissimmee\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 928]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9795879125595093, \"percentile_inc_nulls\": 0.9795879125595093, \"value_count\": 425, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 425.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9712309837341309, \"percentile_inc_nulls\": 0.9712309837341309, \"value_count\": 174, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 174.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.964026689529419, \"percentile_inc_nulls\": 0.964026689529419, \"value_count\": 150, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 150.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9597522020339966, \"percentile_inc_nulls\": 
0.9597522020339966, \"value_count\": 89, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 89.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9559579491615295, \"percentile_inc_nulls\": 0.9559579491615295, \"value_count\": 79, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 79.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9490418434143066, \"percentile_inc_nulls\": 0.9490418434143066, \"value_count\": 72, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.942702054977417, \"percentile_inc_nulls\": 0.942702054977417, \"value_count\": 66, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 132.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9365544319152832, \"percentile_inc_nulls\": 0.9365544319152832, \"value_count\": 64, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 128.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9335286617279053, \"percentile_inc_nulls\": 0.9335286617279053, \"value_count\": 63, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 63.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9305508732795715, \"percentile_inc_nulls\": 0.9305508732795715, \"value_count\": 62, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 62.0, \"distinct_value_count\": 11403}, 
{\"percentile_ex_nulls\": 0.9277172088623047, \"percentile_inc_nulls\": 0.9277172088623047, \"value_count\": 59, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 59.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9251717329025269, \"percentile_inc_nulls\": 0.9251717329025269, \"value_count\": 53, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 53.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9202728271484375, \"percentile_inc_nulls\": 0.9202728271484375, \"value_count\": 51, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9179674386978149, \"percentile_inc_nulls\": 0.9179674386978149, \"value_count\": 48, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 48.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9111953973770142, \"percentile_inc_nulls\": 0.9111953973770142, \"value_count\": 47, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 141.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9089860916137695, \"percentile_inc_nulls\": 0.9089860916137695, \"value_count\": 46, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 46.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9068248271942139, \"percentile_inc_nulls\": 0.9068248271942139, \"value_count\": 45, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, 
\"sum_tokens_in_value_count_group\": 45.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9047116041183472, \"percentile_inc_nulls\": 0.9047116041183472, \"value_count\": 44, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 44.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9027424454689026, \"percentile_inc_nulls\": 0.9027424454689026, \"value_count\": 41, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 41.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8989001512527466, \"percentile_inc_nulls\": 0.8989001512527466, \"value_count\": 40, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 80.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8915998339653015, \"percentile_inc_nulls\": 0.8915998339653015, \"value_count\": 38, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 152.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.886268675327301, \"percentile_inc_nulls\": 0.886268675327301, \"value_count\": 37, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 111.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8829066753387451, \"percentile_inc_nulls\": 0.8829066753387451, \"value_count\": 35, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8813217282295227, \"percentile_inc_nulls\": 0.8813217282295227, \"value_count\": 33, \"group_name\": \"_street_address_\", 
\"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8738773465156555, \"percentile_inc_nulls\": 0.8738773465156555, \"value_count\": 31, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 155.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8709956407546997, \"percentile_inc_nulls\": 0.8709956407546997, \"value_count\": 30, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 60.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8696027994155884, \"percentile_inc_nulls\": 0.8696027994155884, \"value_count\": 29, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 29.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8628787994384766, \"percentile_inc_nulls\": 0.8628787994384766, \"value_count\": 28, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 140.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8563950061798096, \"percentile_inc_nulls\": 0.8563950061798096, \"value_count\": 27, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 135.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8526487350463867, \"percentile_inc_nulls\": 0.8526487350463867, \"value_count\": 26, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8490466475486755, \"percentile_inc_nulls\": 0.8490466475486755, 
\"value_count\": 25, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 75.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8455885648727417, \"percentile_inc_nulls\": 0.8455885648727417, \"value_count\": 24, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 72.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8411699533462524, \"percentile_inc_nulls\": 0.8411699533462524, \"value_count\": 23, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 92.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8401133418083191, \"percentile_inc_nulls\": 0.8401133418083191, \"value_count\": 22, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 22.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8380961418151855, \"percentile_inc_nulls\": 0.8380961418151855, \"value_count\": 21, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8332933187484741, \"percentile_inc_nulls\": 0.8332933187484741, \"value_count\": 20, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 100.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.828730583190918, \"percentile_inc_nulls\": 0.828730583190918, \"value_count\": 19, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 95.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 
0.8218145370483398, \"percentile_inc_nulls\": 0.8218145370483398, \"value_count\": 18, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8136496543884277, \"percentile_inc_nulls\": 0.8136496543884277, \"value_count\": 17, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 170.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8082705140113831, \"percentile_inc_nulls\": 0.8082705140113831, \"value_count\": 16, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.800345778465271, \"percentile_inc_nulls\": 0.800345778465271, \"value_count\": 15, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 165.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.7956390380859375, \"percentile_inc_nulls\": 0.7956390380859375, \"value_count\": 14, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.7844003438949585, \"percentile_inc_nulls\": 0.7844003438949585, \"value_count\": 13, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.7728735208511353, \"percentile_inc_nulls\": 0.7728735208511353, \"value_count\": 12, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 240.0, 
\"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.7633638978004456, \"percentile_inc_nulls\": 0.7633638978004456, \"value_count\": 11, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 198.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.7494356632232666, \"percentile_inc_nulls\": 0.7494356632232666, \"value_count\": 10, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 290.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.7351711988449097, \"percentile_inc_nulls\": 0.7351711988449097, \"value_count\": 9, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 297.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.7171125411987305, \"percentile_inc_nulls\": 0.7171125411987305, \"value_count\": 8, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 376.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.6966043710708618, \"percentile_inc_nulls\": 0.6966043710708618, \"value_count\": 7, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 427.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.6680755019187927, \"percentile_inc_nulls\": 0.6680755019187927, \"value_count\": 6, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 594.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.6394985914230347, \"percentile_inc_nulls\": 0.6394985914230347, \"value_count\": 5, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, 
\"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 595.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.6006916165351868, \"percentile_inc_nulls\": 0.6006916165351868, \"value_count\": 4, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 808.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.548388659954071, \"percentile_inc_nulls\": 0.548388659954071, \"value_count\": 3, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1089.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.43811535835266113, \"percentile_inc_nulls\": 0.43811535835266113, \"value_count\": 2, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 2296.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 1, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 9122.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 425, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 425.0, \"distinct_value_count\": 11403}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": 
\"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"street_address\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 11403 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 425, \"group_name\": \"_street_address_\", \"value\": \"700 universe blvd\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 174, \"group_name\": \"_street_address_\", \"value\": \"130 roberts st\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 150, \"group_name\": \"_street_address_\", \"value\": \"800 taylor st suite 200\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 89, \"group_name\": \"_street_address_\", \"value\": \"333 washington st\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 79, \"group_name\": \"_street_address_\", \"value\": \"1519 king st\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 72, \"group_name\": \"_street_address_\", \"value\": \"575 fifth ave 35th fl\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 72, \"group_name\": \"_street_address_\", \"value\": \"222 2nd ave south suite 1900\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 66, \"group_name\": \"_street_address_\", \"value\": \"101 summer st 2nd floor\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 66, \"group_name\": \"_street_address_\", \"value\": \"50101 
governors dr\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 64, \"group_name\": \"_street_address_\", \"value\": \"9405 arrowpoint blvd\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"13915 kimberly\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"220 w main sreet\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"2404 15th streetpo box 988\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"po box 2000\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"1255 23rd st nw ste 300\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", 
\"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 425]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}], \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\"}, {\"mode\": \"vega-lite\"});\n",
        "</script>"
       ],
       "text/plain": [
        "alt.VConcatChart(...)"
       ]
      },
-     "execution_count": 116,
+     "execution_count": 25,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1037,7 +674,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 117,
+   "execution_count": 26,
    "id": "1e6d3d24-f8ce-4118-9d31-d4f237d28237",
    "metadata": {},
    "outputs": [
@@ -1051,7 +688,7 @@
        " 'link_type_join_condition': 'where l.\"source_dataset\" || \\'-__-\\' || l.\"record_id\" < r.\"source_dataset\" || \\'-__-\\' || r.\"record_id\" and l.\"source_dataset\" != r.\"source_dataset\"'}"
       ]
      },
-     "execution_count": 117,
+     "execution_count": 26,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1071,7 +708,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 118,
+   "execution_count": 27,
    "id": "c37c117a-0fa2-4b8a-9fae-da3569895ca3",
    "metadata": {},
    "outputs": [
@@ -1135,7 +772,7 @@
        "2  FRST      816       36        29376"
       ]
      },
-     "execution_count": 118,
+     "execution_count": 27,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1154,7 +791,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 121,
+   "execution_count": 28,
    "id": "4e1a9844-5d98-4cac-a083-eef134f083ce",
    "metadata": {},
    "outputs": [
@@ -1163,23 +800,23 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-7213d070f2cd42878025324dddfeb43b.vega-embed {\n",
+       "  #altair-viz-992b4c7852b74b80835e3a88352b4008.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-7213d070f2cd42878025324dddfeb43b.vega-embed details,\n",
-       "  #altair-viz-7213d070f2cd42878025324dddfeb43b.vega-embed details summary {\n",
+       "  #altair-viz-992b4c7852b74b80835e3a88352b4008.vega-embed details,\n",
+       "  #altair-viz-992b4c7852b74b80835e3a88352b4008.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-7213d070f2cd42878025324dddfeb43b\"></div>\n",
+       "<div id=\"altair-viz-992b4c7852b74b80835e3a88352b4008\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-7213d070f2cd42878025324dddfeb43b\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-7213d070f2cd42878025324dddfeb43b\");\n",
+       "    if (outputDiv.id !== \"altair-viz-992b4c7852b74b80835e3a88352b4008\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-992b4c7852b74b80835e3a88352b4008\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -1232,7 +869,7 @@
        "alt.Chart(...)"
       ]
      },
-     "execution_count": 121,
+     "execution_count": 28,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1257,50 +894,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 334,
-   "id": "cb8b02b2-50a1-4525-9516-eecdf9a145db",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# NOT USED\n",
-    "company_name_comparison = cl.CustomComparison(\n",
-    "    comparison_levels = [\n",
-    "        cll.NullLevel(\"company_name\"),\n",
-    "        cll.ExactMatchLevel(\"company_name\"),\n",
-    "        # cll.ExactMatchLevel(\"company_name_no_legal\"),\n",
-    "        # cll.LevenshteinLevel(\"company_name\", distance_threshold=1),\n",
-    "        cll.JaroWinklerLevel(\"company_name_no_legal\", distance_threshold=.95),\n",
-    "        # cll.ArraySubsetLevel(\"company_name_mphone_list\"),\n",
-    "        cll.ArrayIntersectLevel(\"company_name_mphone_list\", min_intersection=3)\n",
-    "    ],\n",
-    "    output_column_name=\"company_name\",\n",
-    "    comparison_description=None\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 422,
-   "id": "d2e043ed-7f64-4547-992d-7f947a63db6d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# NOT USED\n",
-    "address_comparison = cl.CustomComparison(\n",
-    "    comparison_levels = [\n",
-    "        cll.NullLevel(\"street_address\"),\n",
-    "        cll.ExactMatchLevel(\"street_address\"),\n",
-    "        cll.LevenshteinLevel(\"street_address\", distance_threshold=1),\n",
-    "        cll.ArraySubsetLevel(\"street_address_list\"),\n",
-    "    ],\n",
-    "    output_column_name=\"street_address\",\n",
-    "    comparison_description=None\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 122,
+   "execution_count": 29,
    "id": "f72f546c-c338-4c9f-aab1-ba03c3169e18",
    "metadata": {},
    "outputs": [
@@ -1324,7 +918,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 123,
+   "execution_count": 30,
    "id": "4298a288-c306-4d75-9d72-e5b8f87774ce",
    "metadata": {},
    "outputs": [
@@ -1348,7 +942,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 124,
+   "execution_count": 31,
    "id": "afdd5872-bc29-406f-bd0a-d5f4436f6794",
    "metadata": {},
    "outputs": [
@@ -1371,7 +965,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 125,
+   "execution_count": 32,
    "id": "90596d17-edb4-4ed1-9306-ea6c33ad00c6",
    "metadata": {},
    "outputs": [
@@ -1395,7 +989,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 126,
+   "execution_count": 33,
    "id": "a946c820-9b93-41f1-9fc2-6da47d0cf407",
    "metadata": {},
    "outputs": [],
@@ -1418,7 +1012,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 127,
+   "execution_count": 34,
    "id": "36cae876-783d-4bff-89df-9d30cc5e60d6",
    "metadata": {},
    "outputs": [
@@ -1437,7 +1031,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 128,
+   "execution_count": 35,
    "id": "a5190bda-070d-4d8e-a6db-be7c65b04db3",
    "metadata": {},
    "outputs": [
@@ -1451,7 +1045,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "f4e8733639644336a9a29f9b599af513",
+       "model_id": "c4bcd9c2605a413aab003a2484a4a006",
        "version_major": 2,
        "version_minor": 0
       },
@@ -1465,7 +1059,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "2fc66d179b9a430795b4ec68a164c22e",
+       "model_id": "b15bb7a15e37447ba1366278db3ab2bd",
        "version_major": 2,
        "version_minor": 0
       },
@@ -1497,7 +1091,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 129,
+   "execution_count": 36,
    "id": "f8dd1336-8a5b-49d2-a054-df0c0a0e953f",
    "metadata": {},
    "outputs": [
@@ -1528,7 +1122,7 @@
       "Iteration 1: Largest change in params was 0.702 in the m_probability of street_address, level `All other comparisons`\n",
       "Iteration 2: Largest change in params was 0.283 in probability_two_random_records_match\n",
       "Iteration 3: Largest change in params was 0.282 in probability_two_random_records_match\n",
-      "Iteration 4: Largest change in params was 0.000537 in probability_two_random_records_match\n",
+      "Iteration 4: Largest change in params was 0.000535 in probability_two_random_records_match\n",
       "Iteration 5: Largest change in params was 1.09e-07 in probability_two_random_records_match\n",
       "\n",
       "EM converged after 5 iterations\n",
@@ -1549,7 +1143,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 130,
+   "execution_count": 37,
    "id": "9581aa18-3352-429a-86c4-6078bcf13a55",
    "metadata": {},
    "outputs": [
@@ -1572,16 +1166,16 @@
       "    - street_address\n",
       "\n",
       "Iteration 1: Largest change in params was -0.967 in the m_probability of company_name_no_legal, level `Exact match on company_name_no_legal`\n",
-      "Iteration 2: Largest change in params was 0.477 in probability_two_random_records_match\n",
-      "Iteration 3: Largest change in params was 0.0395 in probability_two_random_records_match\n",
-      "Iteration 4: Largest change in params was 0.0443 in the m_probability of city, level `All other comparisons`\n",
-      "Iteration 5: Largest change in params was 0.0195 in probability_two_random_records_match\n",
-      "Iteration 6: Largest change in params was 0.00733 in probability_two_random_records_match\n",
-      "Iteration 7: Largest change in params was 0.00275 in probability_two_random_records_match\n",
-      "Iteration 8: Largest change in params was 0.00105 in probability_two_random_records_match\n",
-      "Iteration 9: Largest change in params was 0.0004 in probability_two_random_records_match\n",
+      "Iteration 2: Largest change in params was 0.476 in probability_two_random_records_match\n",
+      "Iteration 3: Largest change in params was 0.0397 in probability_two_random_records_match\n",
+      "Iteration 4: Largest change in params was 0.0442 in the m_probability of city, level `All other comparisons`\n",
+      "Iteration 5: Largest change in params was 0.0194 in probability_two_random_records_match\n",
+      "Iteration 6: Largest change in params was 0.00729 in probability_two_random_records_match\n",
+      "Iteration 7: Largest change in params was 0.00274 in probability_two_random_records_match\n",
+      "Iteration 8: Largest change in params was 0.00104 in probability_two_random_records_match\n",
+      "Iteration 9: Largest change in params was 0.000398 in probability_two_random_records_match\n",
       "Iteration 10: Largest change in params was 0.000153 in probability_two_random_records_match\n",
-      "Iteration 11: Largest change in params was 5.9e-05 in probability_two_random_records_match\n",
+      "Iteration 11: Largest change in params was 5.88e-05 in probability_two_random_records_match\n",
       "\n",
       "EM converged after 11 iterations\n",
       "\n",
@@ -1598,7 +1192,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 131,
+   "execution_count": 38,
    "id": "61298aa2-dbd4-4f2a-9c25-5f831d226d13",
    "metadata": {},
    "outputs": [
@@ -1607,23 +1201,23 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-860ba1d851fa4b559933e2ae8a6d5f81.vega-embed {\n",
+       "  #altair-viz-185f3e4a9af6415baed71f7c69036c99.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-860ba1d851fa4b559933e2ae8a6d5f81.vega-embed details,\n",
-       "  #altair-viz-860ba1d851fa4b559933e2ae8a6d5f81.vega-embed details summary {\n",
+       "  #altair-viz-185f3e4a9af6415baed71f7c69036c99.vega-embed details,\n",
+       "  #altair-viz-185f3e4a9af6415baed71f7c69036c99.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-860ba1d851fa4b559933e2ae8a6d5f81\"></div>\n",
+       "<div id=\"altair-viz-185f3e4a9af6415baed71f7c69036c99\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-860ba1d851fa4b559933e2ae8a6d5f81\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-860ba1d851fa4b559933e2ae8a6d5f81\");\n",
+       "    if (outputDiv.id !== \"altair-viz-185f3e4a9af6415baed71f7c69036c99\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-185f3e4a9af6415baed71f7c69036c99\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -1669,14 +1263,14 @@
        "        .catch(showError)\n",
        "        .then(() => displayChart(vegaEmbed));\n",
        "    }\n",
-       "  })({\"config\": {\"view\": {\"continuousWidth\": 300, \"continuousHeight\": 300, \"discreteHeight\": 60, \"discreteWidth\": 400}, \"header\": {\"title\": null}, \"mark\": {\"tooltip\": null}, \"title\": {\"anchor\": \"middle\"}}, \"vconcat\": [{\"mark\": {\"type\": \"bar\", \"clip\": true, \"height\": 15}, \"encoding\": {\"color\": {\"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-10, 0, 10], \"interpolate\": \"lab\", \"range\": [\"red\", \"#bbbbbb\", \"green\"]}, \"title\": \"Match weight\", \"type\": \"quantitative\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"probability_two_random_records_match\", \"format\": \".4f\", \"title\": \"Probability two random records match\", \"type\": \"nominal\"}, {\"field\": \"log2_bayes_factor\", \"format\": \",.4f\", \"title\": \"Equivalent match weight\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"domain\": false, \"gridColor\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": \"#aaa\"}, \"value\": \"#ddd\"}, \"gridDash\": {\"condition\": {\"test\": \"abs(datum.value / 10) == 1\", \"value\": [3]}, \"value\": null}, \"gridWidth\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": 2}, \"value\": 1}, \"labels\": false, \"ticks\": false, \"title\": \"\"}, \"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-20, 20]}, \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": \"Prior (starting) match weight\", \"titleAlign\": \"right\", \"titleAngle\": 0, \"titleFontWeight\": \"normal\"}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": 20, \"transform\": [{\"filter\": \"(datum.comparison_name == 
'probability_two_random_records_match')\"}]}, {\"mark\": {\"type\": \"bar\", \"clip\": true}, \"encoding\": {\"color\": {\"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-10, 0, 10], \"interpolate\": \"lab\", \"range\": [\"red\", \"#bbbbbb\", \"green\"]}, \"title\": \"Match weight\", \"type\": \"quantitative\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labelAlign\": \"left\", \"labelAnchor\": \"middle\", \"labelAngle\": 0}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"gridColor\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": \"#aaa\"}, \"value\": \"#ddd\"}, \"gridDash\": {\"condition\": {\"test\": \"abs(datum.value / 10) == 1\", \"value\": [3]}, \"value\": null}, \"gridWidth\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": 2}, \"value\": 1}, 
\"title\": \"Comparison level match weight = log2(m/u)\"}, \"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-20, 20]}, \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"axis\": {\"y\": \"independent\"}, \"scale\": {\"y\": \"independent\"}}, \"transform\": [{\"filter\": \"(datum.comparison_name != 'probability_two_random_records_match')\"}]}], \"data\": {\"name\": \"data-64b98266126531a5fb88840b22d4f48f\"}, \"params\": [{\"name\": \"mouse_zoom\", \"select\": {\"type\": \"interval\", \"encodings\": [\"x\"]}, \"bind\": \"scales\", \"views\": []}], \"resolve\": {\"axis\": {\"y\": \"independent\"}, \"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Model parameters (components of final match weight)\", \"subtitle\": \"Use mousewheel to zoom\"}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-64b98266126531a5fb88840b22d4f48f\": [{\"comparison_name\": \"probability_two_random_records_match\", \"sql_condition\": null, \"label_for_charts\": \"\", \"m_probability\": null, \"u_probability\": null, \"m_probability_description\": null, \"u_probability_description\": null, \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": null, \"is_null_level\": false, \"bayes_factor\": 2.3743083676072958e-06, \"log2_bayes_factor\": -18.684061249539493, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 0, \"bayes_factor_description\": \"The probability that two random records drawn at random match is 0.000 or one in  421,176.3 records.This is equivalent to a starting match weight of -18.684.\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": -1}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": 
\"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Exact match on company_name_no_legal\", \"m_probability\": 0.5058151845954096, \"u_probability\": 7.755771009548427e-07, \"m_probability_description\": \"Amongst matching record comparisons, 50.58% of records (i.e. one in 1.977) are in the exact match on company_name_no_legal comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 7.756e-05% of records (i.e. one in 1,289,362) are in the exact match on company_name_no_legal comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"company_name_no_legal\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 652179.1114934688, \"log2_bayes_factor\": 19.314908708489483, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on company_name_no_legal` then comparison is 652,179 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"jaro_winkler_similarity(\\\"company_name_no_legal_l\\\", \\\"company_name_no_legal_r\\\") >= 0.95\", \"label_for_charts\": \"Jaro-Winkler distance of company_name_no_legal >= 0.95\", \"m_probability\": 0.002325464819968982, \"u_probability\": 4.0767514280959686e-07, \"m_probability_description\": \"Amongst matching record comparisons, 0.2325% of records (i.e. one in 430) are in the jaro-winkler distance of company_name_no_legal >= 0.95 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 4.077e-05% of records (i.e. 
one in 2,452,933) are in the jaro-winkler distance of company_name_no_legal >= 0.95 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 5704.210474895404, \"log2_bayes_factor\": 12.477811500224687, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of company_name_no_legal >= 0.95` then comparison is 5,704 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.9860441659892117, \"u_probability\": 0.9999988167477563, \"m_probability_description\": \"Amongst matching record comparisons, 98.6% of records (i.e. one in 1.014) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 100% of records (i.e. 
one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.9860453327295641, \"log2_bayes_factor\": -0.020274119885879474, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.014 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"\\\"street_address_l\\\" = \\\"street_address_r\\\"\", \"label_for_charts\": \"Exact match on street_address\", \"m_probability\": 0.10984848485092523, \"u_probability\": 1.1623713218156555e-05, \"m_probability_description\": \"Amongst matching record comparisons, 10.98% of records (i.e. one in 9.103) are in the exact match on street_address comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.001162% of records (i.e. 
one in 86,031) are in the exact match on street_address comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"street_address\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 9450.378101150924, \"log2_bayes_factor\": 13.20615633608501, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on street_address` then comparison is 9,450 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"levenshtein(\\\"street_address_l\\\", \\\"street_address_r\\\") <= 1\", \"label_for_charts\": \"Levenshtein distance of street_address <= 1\", \"m_probability\": 0.008522727272916612, \"u_probability\": 2.0015855182334595e-05, \"m_probability_description\": \"Amongst matching record comparisons, 0.8523% of records (i.e. one in 117) are in the levenshtein distance of street_address <= 1 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.002002% of records (i.e. 
one in 49,960) are in the levenshtein distance of street_address <= 1 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 425.7988077590869, \"log2_bayes_factor\": 8.734028100010068, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `levenshtein distance of street_address <= 1` then comparison is 426 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.8816287878761582, \"u_probability\": 0.9999683604315995, \"m_probability_description\": \"Amongst matching record comparisons, 88.16% of records (i.e. one in 1.134) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 100% of records (i.e. one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.8816566831130892, \"log2_bayes_factor\": -0.18171111483340682, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.134 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"state\", \"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Exact match on state\", \"m_probability\": 0.811446456778291, \"u_probability\": 0.05111863613991284, \"m_probability_description\": \"Amongst matching record comparisons, 81.14% of records (i.e. 
one in 1.232) are in the exact match on state comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 5.112% of records (i.e. one in 19.56) are in the exact match on state comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"state\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 15.873789248941307, \"log2_bayes_factor\": 3.9885746514233986, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on state` then comparison is 15.87 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 2}, {\"comparison_name\": \"state\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.188553543221709, \"u_probability\": 0.9488813638600871, \"m_probability_description\": \"Amongst matching record comparisons, 18.86% of records (i.e. one in 5.304) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 94.89% of records (i.e. 
one in 1.054) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.19871139891995104, \"log2_bayes_factor\": -2.3312534608825977, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 5.032 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 2}, {\"comparison_name\": \"city\", \"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Exact match on city\", \"m_probability\": 0.6839791977766309, \"u_probability\": 0.006331298335827164, \"m_probability_description\": \"Amongst matching record comparisons, 68.4% of records (i.e. one in 1.462) are in the exact match on city comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.6331% of records (i.e. one in 158) are in the exact match on city comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"city\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 108.03142759932369, \"log2_bayes_factor\": 6.755307259996993, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on city` then comparison is 108 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 3}, {\"comparison_name\": \"city\", \"sql_condition\": \"jaro_winkler_similarity(\\\"city_l\\\", \\\"city_r\\\") >= 0.9\", \"label_for_charts\": \"Jaro-Winkler distance of city >= 0.9\", \"m_probability\": 0.021461003233482275, \"u_probability\": 0.0005131439788678606, \"m_probability_description\": \"Amongst matching record comparisons, 2.146% of records (i.e. 
one in 46.6) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.05131% of records (i.e. one in 1,949) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 41.822576347541414, \"log2_bayes_factor\": 5.386210032217432, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of city >= 0.9` then comparison is 41.82 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 3}, {\"comparison_name\": \"city\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.29455979898988677, \"u_probability\": 0.993155557685305, \"m_probability_description\": \"Amongst matching record comparisons, 29.46% of records (i.e. one in 3.395) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 99.32% of records (i.e. one in 1.007) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.29658979070348424, \"log2_bayes_factor\": -1.7534591570912872, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 3.372 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 3}]}}, {\"mode\": \"vega-lite\"});\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 300, \"continuousHeight\": 300, \"discreteHeight\": 60, \"discreteWidth\": 400}, \"header\": {\"title\": null}, \"mark\": {\"tooltip\": null}, \"title\": {\"anchor\": \"middle\"}}, \"vconcat\": [{\"mark\": {\"type\": \"bar\", \"clip\": true, \"height\": 15}, \"encoding\": {\"color\": {\"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-10, 0, 10], \"interpolate\": \"lab\", \"range\": [\"red\", \"#bbbbbb\", \"green\"]}, \"title\": \"Match weight\", \"type\": \"quantitative\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"probability_two_random_records_match\", \"format\": \".4f\", \"title\": \"Probability two random records match\", \"type\": \"nominal\"}, {\"field\": \"log2_bayes_factor\", \"format\": \",.4f\", \"title\": \"Equivalent match weight\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"domain\": false, \"gridColor\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": \"#aaa\"}, \"value\": \"#ddd\"}, \"gridDash\": {\"condition\": {\"test\": \"abs(datum.value / 10) == 1\", \"value\": [3]}, \"value\": null}, \"gridWidth\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": 2}, \"value\": 1}, \"labels\": false, \"ticks\": false, \"title\": \"\"}, \"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-19, 19]}, \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": \"Prior (starting) match weight\", \"titleAlign\": \"right\", \"titleAngle\": 0, \"titleFontWeight\": \"normal\"}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": 20, \"transform\": [{\"filter\": \"(datum.comparison_name == 
'probability_two_random_records_match')\"}]}, {\"mark\": {\"type\": \"bar\", \"clip\": true}, \"encoding\": {\"color\": {\"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-10, 0, 10], \"interpolate\": \"lab\", \"range\": [\"red\", \"#bbbbbb\", \"green\"]}, \"title\": \"Match weight\", \"type\": \"quantitative\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labelAlign\": \"left\", \"labelAnchor\": \"middle\", \"labelAngle\": 0}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"gridColor\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": \"#aaa\"}, \"value\": \"#ddd\"}, \"gridDash\": {\"condition\": {\"test\": \"abs(datum.value / 10) == 1\", \"value\": [3]}, \"value\": null}, \"gridWidth\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": 2}, \"value\": 1}, 
\"title\": \"Comparison level match weight = log2(m/u)\"}, \"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-19, 19]}, \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"axis\": {\"y\": \"independent\"}, \"scale\": {\"y\": \"independent\"}}, \"transform\": [{\"filter\": \"(datum.comparison_name != 'probability_two_random_records_match')\"}]}], \"data\": {\"name\": \"data-740da21ad061123ec94a64fd1de6c98f\"}, \"params\": [{\"name\": \"mouse_zoom\", \"select\": {\"type\": \"interval\", \"encodings\": [\"x\"]}, \"bind\": \"scales\", \"views\": []}], \"resolve\": {\"axis\": {\"y\": \"independent\"}, \"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Model parameters (components of final match weight)\", \"subtitle\": \"Use mousewheel to zoom\"}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-740da21ad061123ec94a64fd1de6c98f\": [{\"comparison_name\": \"probability_two_random_records_match\", \"sql_condition\": null, \"label_for_charts\": \"\", \"m_probability\": null, \"u_probability\": null, \"m_probability_description\": null, \"u_probability_description\": null, \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": null, \"is_null_level\": false, \"bayes_factor\": 2.3743083676072958e-06, \"log2_bayes_factor\": -18.684061249539493, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 0, \"bayes_factor_description\": \"The probability that two random records drawn at random match is 0.000 or one in  421,176.3 records.This is equivalent to a starting match weight of -18.684.\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": -1}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": 
\"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Exact match on company_name_no_legal\", \"m_probability\": 0.5058150813342962, \"u_probability\": 1.2180592034555192e-06, \"m_probability_description\": \"Amongst matching record comparisons, 50.58% of records (i.e. one in 1.977) are in the exact match on company_name_no_legal comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.0001218% of records (i.e. one in 820,978) are in the exact match on company_name_no_legal comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"company_name_no_legal\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 415263.13326917647, \"log2_bayes_factor\": 18.663666270565923, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on company_name_no_legal` then comparison is 415,263 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"jaro_winkler_similarity(\\\"company_name_no_legal_l\\\", \\\"company_name_no_legal_r\\\") >= 0.95\", \"label_for_charts\": \"Jaro-Winkler distance of company_name_no_legal >= 0.95\", \"m_probability\": 0.0023254626281341145, \"u_probability\": 3.8253098951495645e-07, \"m_probability_description\": \"Amongst matching record comparisons, 0.2325% of records (i.e. one in 430) are in the jaro-winkler distance of company_name_no_legal >= 0.95 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 3.825e-05% of records (i.e. 
one in 2,614,167) are in the jaro-winkler distance of company_name_no_legal >= 0.95 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 6079.148335361709, \"log2_bayes_factor\": 12.569653506618183, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of company_name_no_legal >= 0.95` then comparison is 6,079 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.9860443747032734, \"u_probability\": 0.9999983994098071, \"m_probability_description\": \"Amongst matching record comparisons, 98.6% of records (i.e. one in 1.014) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 100% of records (i.e. 
one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.9860459529587555, \"log2_bayes_factor\": -0.020273212421214715, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.014 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"\\\"street_address_l\\\" = \\\"street_address_r\\\"\", \"label_for_charts\": \"Exact match on street_address\", \"m_probability\": 0.10984848485090659, \"u_probability\": 1.143566326549975e-05, \"m_probability_description\": \"Amongst matching record comparisons, 10.98% of records (i.e. one in 9.103) are in the exact match on street_address comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.001144% of records (i.e. 
one in 87,446) are in the exact match on street_address comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"street_address\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 9605.781693687017, \"log2_bayes_factor\": 13.229687306123738, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on street_address` then comparison is 9,606 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"levenshtein(\\\"street_address_l\\\", \\\"street_address_r\\\") <= 1\", \"label_for_charts\": \"Levenshtein distance of street_address <= 1\", \"m_probability\": 0.008522727272915167, \"u_probability\": 2.119020349813114e-05, \"m_probability_description\": \"Amongst matching record comparisons, 0.8523% of records (i.e. one in 117) are in the levenshtein distance of street_address <= 1 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.002119% of records (i.e. 
one in 47,192) are in the levenshtein distance of street_address <= 1 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 402.20129427576404, \"log2_bayes_factor\": 8.651773913989402, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `levenshtein distance of street_address <= 1` then comparison is 402 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.8816287878761782, \"u_probability\": 0.9999673741332363, \"m_probability_description\": \"Amongst matching record comparisons, 88.16% of records (i.e. one in 1.134) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 100% of records (i.e. one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.8816575527180243, \"log2_bayes_factor\": -0.18170969185989266, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.134 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"state\", \"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Exact match on state\", \"m_probability\": 0.8114435216387779, \"u_probability\": 0.052535716053222686, \"m_probability_description\": \"Amongst matching record comparisons, 81.14% of records (i.e. 
one in 1.232) are in the exact match on state comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 5.254% of records (i.e. one in 19.03) are in the exact match on state comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"state\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 15.445559375582198, \"log2_bayes_factor\": 3.949120215368288, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on state` then comparison is 15.45 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 2}, {\"comparison_name\": \"state\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.1885564783612222, \"u_probability\": 0.9474642839467773, \"m_probability_description\": \"Amongst matching record comparisons, 18.86% of records (i.e. one in 5.303) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 94.75% of records (i.e. 
one in 1.055) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.19901170055272938, \"log2_bayes_factor\": -2.3290748408362094, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 5.025 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 2}, {\"comparison_name\": \"city\", \"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Exact match on city\", \"m_probability\": 0.6839727297704452, \"u_probability\": 0.0067046866658963065, \"m_probability_description\": \"Amongst matching record comparisons, 68.4% of records (i.e. one in 1.462) are in the exact match on city comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.6705% of records (i.e. one in 149) are in the exact match on city comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"city\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 102.01412293426084, \"log2_bayes_factor\": 6.672625083900734, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on city` then comparison is 102 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 3}, {\"comparison_name\": \"city\", \"sql_condition\": \"jaro_winkler_similarity(\\\"city_l\\\", \\\"city_r\\\") >= 0.9\", \"label_for_charts\": \"Jaro-Winkler distance of city >= 0.9\", \"m_probability\": 0.021460503551289617, \"u_probability\": 0.0005311505676706402, \"m_probability_description\": \"Amongst matching record comparisons, 2.146% of records (i.e. 
one in 46.6) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.05312% of records (i.e. one in 1,883) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 40.40380422712267, \"log2_bayes_factor\": 5.336419231458947, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of city >= 0.9` then comparison is 40.4 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 3}, {\"comparison_name\": \"city\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.2945667666782652, \"u_probability\": 0.9927641627664331, \"m_probability_description\": \"Amongst matching record comparisons, 29.46% of records (i.e. one in 3.395) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 99.28% of records (i.e. one in 1.007) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.2967137389986223, \"log2_bayes_factor\": -1.7528563641383663, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 3.37 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 3}]}}, {\"mode\": \"vega-lite\"});\n",
        "</script>"
       ],
       "text/plain": [
        "alt.VConcatChart(...)"
       ]
      },
-     "execution_count": 131,
+     "execution_count": 38,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1687,7 +1281,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 132,
+   "execution_count": 39,
    "id": "f365f59e-e4d0-44f3-a1fb-62e0d63d7ba3",
    "metadata": {},
    "outputs": [
@@ -1696,23 +1290,23 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-6202cf8c985a4c3cb08581ec3f06c2bd.vega-embed {\n",
+       "  #altair-viz-750b76fa16304920bb6a14f2dce0c3d7.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-6202cf8c985a4c3cb08581ec3f06c2bd.vega-embed details,\n",
-       "  #altair-viz-6202cf8c985a4c3cb08581ec3f06c2bd.vega-embed details summary {\n",
+       "  #altair-viz-750b76fa16304920bb6a14f2dce0c3d7.vega-embed details,\n",
+       "  #altair-viz-750b76fa16304920bb6a14f2dce0c3d7.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-6202cf8c985a4c3cb08581ec3f06c2bd\"></div>\n",
+       "<div id=\"altair-viz-750b76fa16304920bb6a14f2dce0c3d7\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-6202cf8c985a4c3cb08581ec3f06c2bd\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-6202cf8c985a4c3cb08581ec3f06c2bd\");\n",
+       "    if (outputDiv.id !== \"altair-viz-750b76fa16304920bb6a14f2dce0c3d7\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-750b76fa16304920bb6a14f2dce0c3d7\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -1758,14 +1352,14 @@
        "        .catch(showError)\n",
        "        .then(() => displayChart(vegaEmbed));\n",
        "    }\n",
-       "  })({\"config\": {\"view\": {\"continuousWidth\": 300, \"continuousHeight\": 300, \"discreteHeight\": 300, \"discreteWidth\": 400}, \"header\": {\"title\": null}, \"title\": {\"anchor\": \"middle\", \"offset\": 10}}, \"hconcat\": [{\"mark\": \"bar\", \"encoding\": {\"color\": {\"value\": \"green\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labelAlign\": \"left\", \"labelAnchor\": \"middle\", \"labelAngle\": 0}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"title\": \"Proportion of record comparisons\"}, \"field\": \"m_probability\", \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Amongst 
matching record comparisons:\", \"fontSize\": 12, \"fontWeight\": \"bold\"}, \"transform\": [{\"filter\": \"(datum.bayes_factor != 'no-op filter due to vega lite issue 4680')\"}], \"width\": 150}, {\"mark\": \"bar\", \"encoding\": {\"color\": {\"value\": \"red\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labels\": false}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"title\": \"Proportion of record comparisons\"}, \"field\": \"u_probability\", \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Amongst non-matching record comparisons:\", \"fontSize\": 12, \"fontWeight\": \"bold\"}, \"transform\": 
[{\"filter\": \"(datum.bayes_factor != 'no-op filter2 due to vega lite issue 4680')\"}], \"width\": 150}], \"data\": {\"name\": \"data-dc74bccc7251002cb1499c8a0408d184\"}, \"title\": {\"text\": \"Proportion of record comparisons in each comparison level by match status\", \"subtitle\": \"(m and u probabilities)\"}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-dc74bccc7251002cb1499c8a0408d184\": [{\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Exact match on company_name_no_legal\", \"m_probability\": 0.5058151845954096, \"u_probability\": 7.755771009548427e-07, \"m_probability_description\": \"Amongst matching record comparisons, 50.58% of records (i.e. one in 1.977) are in the exact match on company_name_no_legal comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 7.756e-05% of records (i.e. 
one in 1,289,362) are in the exact match on company_name_no_legal comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"company_name_no_legal\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 652179.1114934688, \"log2_bayes_factor\": 19.314908708489483, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on company_name_no_legal` then comparison is 652,179 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"jaro_winkler_similarity(\\\"company_name_no_legal_l\\\", \\\"company_name_no_legal_r\\\") >= 0.95\", \"label_for_charts\": \"Jaro-Winkler distance of company_name_no_legal >= 0.95\", \"m_probability\": 0.002325464819968982, \"u_probability\": 4.0767514280959686e-07, \"m_probability_description\": \"Amongst matching record comparisons, 0.2325% of records (i.e. one in 430) are in the jaro-winkler distance of company_name_no_legal >= 0.95 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 4.077e-05% of records (i.e. 
one in 2,452,933) are in the jaro-winkler distance of company_name_no_legal >= 0.95 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 5704.210474895404, \"log2_bayes_factor\": 12.477811500224687, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of company_name_no_legal >= 0.95` then comparison is 5,704 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.9860441659892117, \"u_probability\": 0.9999988167477563, \"m_probability_description\": \"Amongst matching record comparisons, 98.6% of records (i.e. one in 1.014) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 100% of records (i.e. 
one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.9860453327295641, \"log2_bayes_factor\": -0.020274119885879474, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.014 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"\\\"street_address_l\\\" = \\\"street_address_r\\\"\", \"label_for_charts\": \"Exact match on street_address\", \"m_probability\": 0.10984848485092523, \"u_probability\": 1.1623713218156555e-05, \"m_probability_description\": \"Amongst matching record comparisons, 10.98% of records (i.e. one in 9.103) are in the exact match on street_address comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.001162% of records (i.e. 
one in 86,031) are in the exact match on street_address comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"street_address\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 9450.378101150924, \"log2_bayes_factor\": 13.20615633608501, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on street_address` then comparison is 9,450 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"levenshtein(\\\"street_address_l\\\", \\\"street_address_r\\\") <= 1\", \"label_for_charts\": \"Levenshtein distance of street_address <= 1\", \"m_probability\": 0.008522727272916612, \"u_probability\": 2.0015855182334595e-05, \"m_probability_description\": \"Amongst matching record comparisons, 0.8523% of records (i.e. one in 117) are in the levenshtein distance of street_address <= 1 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.002002% of records (i.e. 
one in 49,960) are in the levenshtein distance of street_address <= 1 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 425.7988077590869, \"log2_bayes_factor\": 8.734028100010068, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `levenshtein distance of street_address <= 1` then comparison is 426 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.8816287878761582, \"u_probability\": 0.9999683604315995, \"m_probability_description\": \"Amongst matching record comparisons, 88.16% of records (i.e. one in 1.134) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 100% of records (i.e. one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.8816566831130892, \"log2_bayes_factor\": -0.18171111483340682, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.134 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"state\", \"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Exact match on state\", \"m_probability\": 0.811446456778291, \"u_probability\": 0.05111863613991284, \"m_probability_description\": \"Amongst matching record comparisons, 81.14% of records (i.e. 
one in 1.232) are in the exact match on state comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 5.112% of records (i.e. one in 19.56) are in the exact match on state comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"state\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 15.873789248941307, \"log2_bayes_factor\": 3.9885746514233986, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on state` then comparison is 15.87 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 2}, {\"comparison_name\": \"state\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.188553543221709, \"u_probability\": 0.9488813638600871, \"m_probability_description\": \"Amongst matching record comparisons, 18.86% of records (i.e. one in 5.304) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 94.89% of records (i.e. 
one in 1.054) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.19871139891995104, \"log2_bayes_factor\": -2.3312534608825977, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 5.032 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 2}, {\"comparison_name\": \"city\", \"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Exact match on city\", \"m_probability\": 0.6839791977766309, \"u_probability\": 0.006331298335827164, \"m_probability_description\": \"Amongst matching record comparisons, 68.4% of records (i.e. one in 1.462) are in the exact match on city comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.6331% of records (i.e. one in 158) are in the exact match on city comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"city\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 108.03142759932369, \"log2_bayes_factor\": 6.755307259996993, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on city` then comparison is 108 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 3}, {\"comparison_name\": \"city\", \"sql_condition\": \"jaro_winkler_similarity(\\\"city_l\\\", \\\"city_r\\\") >= 0.9\", \"label_for_charts\": \"Jaro-Winkler distance of city >= 0.9\", \"m_probability\": 0.021461003233482275, \"u_probability\": 0.0005131439788678606, \"m_probability_description\": \"Amongst matching record comparisons, 2.146% of records (i.e. 
one in 46.6) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.05131% of records (i.e. one in 1,949) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 41.822576347541414, \"log2_bayes_factor\": 5.386210032217432, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of city >= 0.9` then comparison is 41.82 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 3}, {\"comparison_name\": \"city\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.29455979898988677, \"u_probability\": 0.993155557685305, \"m_probability_description\": \"Amongst matching record comparisons, 29.46% of records (i.e. one in 3.395) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 99.32% of records (i.e. one in 1.007) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.29658979070348424, \"log2_bayes_factor\": -1.7534591570912872, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 3.372 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 3}]}}, {\"mode\": \"vega-lite\"});\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 300, \"continuousHeight\": 300, \"discreteHeight\": 300, \"discreteWidth\": 400}, \"header\": {\"title\": null}, \"title\": {\"anchor\": \"middle\", \"offset\": 10}}, \"hconcat\": [{\"mark\": \"bar\", \"encoding\": {\"color\": {\"value\": \"green\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labelAlign\": \"left\", \"labelAnchor\": \"middle\", \"labelAngle\": 0}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"title\": \"Proportion of record comparisons\"}, \"field\": \"m_probability\", \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Amongst 
matching record comparisons:\", \"fontSize\": 12, \"fontWeight\": \"bold\"}, \"transform\": [{\"filter\": \"(datum.bayes_factor != 'no-op filter due to vega lite issue 4680')\"}], \"width\": 150}, {\"mark\": \"bar\", \"encoding\": {\"color\": {\"value\": \"red\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labels\": false}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"title\": \"Proportion of record comparisons\"}, \"field\": \"u_probability\", \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Amongst non-matching record comparisons:\", \"fontSize\": 12, \"fontWeight\": \"bold\"}, \"transform\": 
[{\"filter\": \"(datum.bayes_factor != 'no-op filter2 due to vega lite issue 4680')\"}], \"width\": 150}], \"data\": {\"name\": \"data-da21a3ec309ec8fce463d576250e1f0d\"}, \"title\": {\"text\": \"Proportion of record comparisons in each comparison level by match status\", \"subtitle\": \"(m and u probabilities)\"}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-da21a3ec309ec8fce463d576250e1f0d\": [{\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Exact match on company_name_no_legal\", \"m_probability\": 0.5058150813342962, \"u_probability\": 1.2180592034555192e-06, \"m_probability_description\": \"Amongst matching record comparisons, 50.58% of records (i.e. one in 1.977) are in the exact match on company_name_no_legal comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.0001218% of records (i.e. 
one in 820,978) are in the exact match on company_name_no_legal comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"company_name_no_legal\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 415263.13326917647, \"log2_bayes_factor\": 18.663666270565923, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on company_name_no_legal` then comparison is 415,263 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"jaro_winkler_similarity(\\\"company_name_no_legal_l\\\", \\\"company_name_no_legal_r\\\") >= 0.95\", \"label_for_charts\": \"Jaro-Winkler distance of company_name_no_legal >= 0.95\", \"m_probability\": 0.0023254626281341145, \"u_probability\": 3.8253098951495645e-07, \"m_probability_description\": \"Amongst matching record comparisons, 0.2325% of records (i.e. one in 430) are in the jaro-winkler distance of company_name_no_legal >= 0.95 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 3.825e-05% of records (i.e. 
one in 2,614,167) are in the jaro-winkler distance of company_name_no_legal >= 0.95 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 6079.148335361709, \"log2_bayes_factor\": 12.569653506618183, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of company_name_no_legal >= 0.95` then comparison is 6,079 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.9860443747032734, \"u_probability\": 0.9999983994098071, \"m_probability_description\": \"Amongst matching record comparisons, 98.6% of records (i.e. one in 1.014) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 100% of records (i.e. 
one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.9860459529587555, \"log2_bayes_factor\": -0.020273212421214715, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.014 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"\\\"street_address_l\\\" = \\\"street_address_r\\\"\", \"label_for_charts\": \"Exact match on street_address\", \"m_probability\": 0.10984848485090659, \"u_probability\": 1.143566326549975e-05, \"m_probability_description\": \"Amongst matching record comparisons, 10.98% of records (i.e. one in 9.103) are in the exact match on street_address comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.001144% of records (i.e. 
one in 87,446) are in the exact match on street_address comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"street_address\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 9605.781693687017, \"log2_bayes_factor\": 13.229687306123738, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on street_address` then comparison is 9,606 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"levenshtein(\\\"street_address_l\\\", \\\"street_address_r\\\") <= 1\", \"label_for_charts\": \"Levenshtein distance of street_address <= 1\", \"m_probability\": 0.008522727272915167, \"u_probability\": 2.119020349813114e-05, \"m_probability_description\": \"Amongst matching record comparisons, 0.8523% of records (i.e. one in 117) are in the levenshtein distance of street_address <= 1 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.002119% of records (i.e. 
one in 47,192) are in the levenshtein distance of street_address <= 1 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 402.20129427576404, \"log2_bayes_factor\": 8.651773913989402, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `levenshtein distance of street_address <= 1` then comparison is 402 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.8816287878761782, \"u_probability\": 0.9999673741332363, \"m_probability_description\": \"Amongst matching record comparisons, 88.16% of records (i.e. one in 1.134) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 100% of records (i.e. one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.8816575527180243, \"log2_bayes_factor\": -0.18170969185989266, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.134 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"state\", \"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Exact match on state\", \"m_probability\": 0.8114435216387779, \"u_probability\": 0.052535716053222686, \"m_probability_description\": \"Amongst matching record comparisons, 81.14% of records (i.e. 
one in 1.232) are in the exact match on state comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 5.254% of records (i.e. one in 19.03) are in the exact match on state comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"state\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 15.445559375582198, \"log2_bayes_factor\": 3.949120215368288, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on state` then comparison is 15.45 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 2}, {\"comparison_name\": \"state\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.1885564783612222, \"u_probability\": 0.9474642839467773, \"m_probability_description\": \"Amongst matching record comparisons, 18.86% of records (i.e. one in 5.303) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 94.75% of records (i.e. 
one in 1.055) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.19901170055272938, \"log2_bayes_factor\": -2.3290748408362094, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 5.025 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 2}, {\"comparison_name\": \"city\", \"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Exact match on city\", \"m_probability\": 0.6839727297704452, \"u_probability\": 0.0067046866658963065, \"m_probability_description\": \"Amongst matching record comparisons, 68.4% of records (i.e. one in 1.462) are in the exact match on city comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.6705% of records (i.e. one in 149) are in the exact match on city comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"city\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 102.01412293426084, \"log2_bayes_factor\": 6.672625083900734, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on city` then comparison is 102 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 3}, {\"comparison_name\": \"city\", \"sql_condition\": \"jaro_winkler_similarity(\\\"city_l\\\", \\\"city_r\\\") >= 0.9\", \"label_for_charts\": \"Jaro-Winkler distance of city >= 0.9\", \"m_probability\": 0.021460503551289617, \"u_probability\": 0.0005311505676706402, \"m_probability_description\": \"Amongst matching record comparisons, 2.146% of records (i.e. 
one in 46.6) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.05312% of records (i.e. one in 1,883) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 40.40380422712267, \"log2_bayes_factor\": 5.336419231458947, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of city >= 0.9` then comparison is 40.4 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 3}, {\"comparison_name\": \"city\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.2945667666782652, \"u_probability\": 0.9927641627664331, \"m_probability_description\": \"Amongst matching record comparisons, 29.46% of records (i.e. one in 3.395) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 99.28% of records (i.e. one in 1.007) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.2967137389986223, \"log2_bayes_factor\": -1.7528563641383663, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 3.37 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 3}]}}, {\"mode\": \"vega-lite\"});\n",
        "</script>"
       ],
       "text/plain": [
        "alt.HConcatChart(...)"
       ]
      },
-     "execution_count": 132,
+     "execution_count": 39,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1781,8 +1375,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# you could save the model weights like this\n",
     "settings = linker.misc.save_model_to_json(\n",
-    "    \"model_unsupervised_city_state_0.json\", overwrite=True\n",
+    "    \"model_unsupervised_0.json\", overwrite=True\n",
     ")"
    ]
   },
@@ -1796,7 +1391,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 133,
+   "execution_count": 40,
    "id": "94e96441-89b6-4516-aa6a-4d1593ce03be",
    "metadata": {},
    "outputs": [
@@ -1805,19 +1400,17 @@
      "output_type": "stream",
      "text": [
       "Blocking time: 0.16 seconds\n",
-      "Predict time: 0.31 seconds\n"
+      "Predict time: 0.26 seconds\n"
      ]
     }
    ],
    "source": [
-    "# it's helpful to keep threshold at .5 just to see what makes it into blocking\n",
-    "# df_predictions = linker.inference.predict(threshold_match_probability=0.5)\n",
     "df_predictions = linker.inference.predict()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 134,
+   "execution_count": 41,
    "id": "722effbc-24e8-45ca-aa64-8cdcd75846e0",
    "metadata": {},
    "outputs": [],
@@ -1827,7 +1420,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 135,
+   "execution_count": 42,
    "id": "21b19a11-15c6-46ab-bd73-7c4752f7e25e",
    "metadata": {},
    "outputs": [
@@ -1894,202 +1487,202 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>295287</th>\n",
-       "      <td>-22.970759</td>\n",
-       "      <td>1.216501e-07</td>\n",
+       "      <td>-22.967975</td>\n",
+       "      <td>1.218850e-07</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>9829</td>\n",
-       "      <td>3043</td>\n",
-       "      <td>capitol bancorp</td>\n",
-       "      <td>capital power</td>\n",
+       "      <td>56230</td>\n",
+       "      <td>19078</td>\n",
+       "      <td>union pacific</td>\n",
+       "      <td>union electric</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.000024</td>\n",
-       "      <td>0.000012</td>\n",
-       "      <td>0.986045</td>\n",
+       "      <td>0.000049</td>\n",
+       "      <td>0.000098</td>\n",
+       "      <td>0.986046</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>capitol bancorp ctr</td>\n",
-       "      <td>120010423 101 st nw</td>\n",
+       "      <td>1416 dodge st</td>\n",
+       "      <td>mc 1400</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.000012</td>\n",
-       "      <td>0.000110</td>\n",
-       "      <td>0.881657</td>\n",
+       "      <td>0.000049</td>\n",
+       "      <td>0.000049</td>\n",
+       "      <td>0.881658</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>mi</td>\n",
-       "      <td>ab</td>\n",
+       "      <td>ne</td>\n",
+       "      <td>mo</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.015147</td>\n",
-       "      <td>0.000197</td>\n",
-       "      <td>0.198711</td>\n",
+       "      <td>0.006455</td>\n",
+       "      <td>0.010118</td>\n",
+       "      <td>0.199012</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>lansing</td>\n",
-       "      <td>edmonton</td>\n",
+       "      <td>omaha</td>\n",
+       "      <td>st louis</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.000293</td>\n",
-       "      <td>0.000428</td>\n",
-       "      <td>0.296590</td>\n",
+       "      <td>0.003448</td>\n",
+       "      <td>0.002764</td>\n",
+       "      <td>0.296714</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>KPTL BNKRP</td>\n",
-       "      <td>KPTL PWR</td>\n",
+       "      <td>UNN PSFK</td>\n",
+       "      <td>UNN ELKTRK</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>383898</th>\n",
-       "      <td>-22.970759</td>\n",
-       "      <td>1.216501e-07</td>\n",
+       "      <th>384509</th>\n",
+       "      <td>-22.967975</td>\n",
+       "      <td>1.218850e-07</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>51783</td>\n",
-       "      <td>17550</td>\n",
-       "      <td>state bancorp</td>\n",
-       "      <td>state street bank and trust</td>\n",
+       "      <td>56484</td>\n",
+       "      <td>19138</td>\n",
+       "      <td>united states lime and minerals</td>\n",
+       "      <td>united water conservation</td>\n",
        "      <td>0</td>\n",
+       "      <td>0.000037</td>\n",
        "      <td>0.000024</td>\n",
-       "      <td>0.000024</td>\n",
-       "      <td>0.986045</td>\n",
+       "      <td>0.986046</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>2 jericho plz</td>\n",
-       "      <td>100 summer st</td>\n",
+       "      <td>5429 lbj fwy</td>\n",
+       "      <td>1701 north lombard st</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.000012</td>\n",
        "      <td>0.000024</td>\n",
-       "      <td>0.881657</td>\n",
+       "      <td>0.000012</td>\n",
+       "      <td>0.881658</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>ny</td>\n",
-       "      <td>ma</td>\n",
+       "      <td>tx</td>\n",
+       "      <td>ca</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.120228</td>\n",
-       "      <td>0.041765</td>\n",
-       "      <td>0.198711</td>\n",
+       "      <td>0.079841</td>\n",
+       "      <td>0.157960</td>\n",
+       "      <td>0.199012</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>jericho</td>\n",
-       "      <td>boston</td>\n",
+       "      <td>dallas</td>\n",
+       "      <td>oxnard</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.000306</td>\n",
-       "      <td>0.014319</td>\n",
-       "      <td>0.296590</td>\n",
+       "      <td>0.013855</td>\n",
+       "      <td>0.000257</td>\n",
+       "      <td>0.296714</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>STT BNKRP</td>\n",
-       "      <td>STT STRT BNK ANT TRST</td>\n",
+       "      <td>UNTT STTS LM ANT MNRLS</td>\n",
+       "      <td>UNTT WTR KNSRFXN</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>383897</th>\n",
-       "      <td>-22.970759</td>\n",
-       "      <td>1.216501e-07</td>\n",
+       "      <th>384504</th>\n",
+       "      <td>-22.967975</td>\n",
+       "      <td>1.218850e-07</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>51782</td>\n",
-       "      <td>17550</td>\n",
-       "      <td>state auto financial</td>\n",
-       "      <td>state street bank and trust</td>\n",
+       "      <td>56436</td>\n",
+       "      <td>19138</td>\n",
+       "      <td>united rentals</td>\n",
+       "      <td>united water conservation</td>\n",
        "      <td>0</td>\n",
        "      <td>0.000024</td>\n",
        "      <td>0.000024</td>\n",
-       "      <td>0.986045</td>\n",
+       "      <td>0.986046</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>518 east broad st</td>\n",
-       "      <td>100 summer st</td>\n",
+       "      <td>100 first stamford pl</td>\n",
+       "      <td>1701 north lombard st</td>\n",
        "      <td>0</td>\n",
+       "      <td>0.000122</td>\n",
        "      <td>0.000012</td>\n",
-       "      <td>0.000024</td>\n",
-       "      <td>0.881657</td>\n",
+       "      <td>0.881658</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>oh</td>\n",
-       "      <td>ma</td>\n",
+       "      <td>ct</td>\n",
+       "      <td>ca</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.016991</td>\n",
-       "      <td>0.041765</td>\n",
-       "      <td>0.198711</td>\n",
+       "      <td>0.020876</td>\n",
+       "      <td>0.157960</td>\n",
+       "      <td>0.199012</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>columbus</td>\n",
-       "      <td>boston</td>\n",
+       "      <td>stamford</td>\n",
+       "      <td>oxnard</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.002788</td>\n",
-       "      <td>0.014319</td>\n",
-       "      <td>0.296590</td>\n",
+       "      <td>0.003950</td>\n",
+       "      <td>0.000257</td>\n",
+       "      <td>0.296714</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>STT AT FNNXL</td>\n",
-       "      <td>STT STRT BNK ANT TRST</td>\n",
+       "      <td>UNTT RNTLS</td>\n",
+       "      <td>UNTT WTR KNSRFXN</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>383896</th>\n",
-       "      <td>-22.970759</td>\n",
-       "      <td>1.216501e-07</td>\n",
+       "      <th>384503</th>\n",
+       "      <td>-22.967975</td>\n",
+       "      <td>1.218850e-07</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>51781</td>\n",
-       "      <td>17550</td>\n",
-       "      <td>state auto financial</td>\n",
-       "      <td>state street bank and trust</td>\n",
+       "      <td>56424</td>\n",
+       "      <td>19138</td>\n",
+       "      <td>united parcel service</td>\n",
+       "      <td>united water conservation</td>\n",
        "      <td>0</td>\n",
        "      <td>0.000024</td>\n",
        "      <td>0.000024</td>\n",
-       "      <td>0.986045</td>\n",
+       "      <td>0.986046</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>518 e broad st</td>\n",
-       "      <td>100 summer st</td>\n",
+       "      <td>55 glenlake pkwy ne</td>\n",
+       "      <td>1701 north lombard st</td>\n",
        "      <td>0</td>\n",
        "      <td>0.000012</td>\n",
-       "      <td>0.000024</td>\n",
-       "      <td>0.881657</td>\n",
+       "      <td>0.000012</td>\n",
+       "      <td>0.881658</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>oh</td>\n",
-       "      <td>ma</td>\n",
+       "      <td>ga</td>\n",
+       "      <td>ca</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.016991</td>\n",
-       "      <td>0.041765</td>\n",
-       "      <td>0.198711</td>\n",
+       "      <td>0.018626</td>\n",
+       "      <td>0.157960</td>\n",
+       "      <td>0.199012</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>columbus</td>\n",
-       "      <td>boston</td>\n",
+       "      <td>atlanta</td>\n",
+       "      <td>oxnard</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.002788</td>\n",
-       "      <td>0.014319</td>\n",
-       "      <td>0.296590</td>\n",
+       "      <td>0.008462</td>\n",
+       "      <td>0.000257</td>\n",
+       "      <td>0.296714</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>STT AT FNNXL</td>\n",
-       "      <td>STT STRT BNK ANT TRST</td>\n",
+       "      <td>UNTT PRSL SRFS</td>\n",
+       "      <td>UNTT WTR KNSRFXN</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>383895</th>\n",
-       "      <td>-22.970759</td>\n",
-       "      <td>1.216501e-07</td>\n",
+       "      <th>384502</th>\n",
+       "      <td>-22.967975</td>\n",
+       "      <td>1.218850e-07</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>51780</td>\n",
-       "      <td>3805</td>\n",
-       "      <td>starz</td>\n",
-       "      <td>citrus world</td>\n",
+       "      <td>56312</td>\n",
+       "      <td>19138</td>\n",
+       "      <td>united bancorp /oh/</td>\n",
+       "      <td>united water conservation</td>\n",
        "      <td>0</td>\n",
        "      <td>0.000024</td>\n",
-       "      <td>0.000049</td>\n",
-       "      <td>0.986045</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.986046</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>8900 liberty cir</td>\n",
-       "      <td>20205 hwy 2720205 hwy 27</td>\n",
+       "      <td>201 south fourth st</td>\n",
+       "      <td>1701 north lombard st</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.000024</td>\n",
        "      <td>0.000012</td>\n",
-       "      <td>0.881657</td>\n",
+       "      <td>0.000012</td>\n",
+       "      <td>0.881658</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>co</td>\n",
-       "      <td>fl</td>\n",
+       "      <td>oh</td>\n",
+       "      <td>ca</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.023802</td>\n",
-       "      <td>0.048477</td>\n",
-       "      <td>0.198711</td>\n",
+       "      <td>0.016991</td>\n",
+       "      <td>0.157960</td>\n",
+       "      <td>0.199012</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>englewood</td>\n",
-       "      <td>lake wales</td>\n",
+       "      <td>martins ferry</td>\n",
+       "      <td>oxnard</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.002947</td>\n",
-       "      <td>0.000049</td>\n",
-       "      <td>0.296590</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000257</td>\n",
+       "      <td>0.296714</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>STRS</td>\n",
-       "      <td>STRS WRLT</td>\n",
+       "      <td>UNTT BNKRP</td>\n",
+       "      <td>UNTT WTR KNSRFXN</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -2133,8 +1726,8 @@
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>186872</th>\n",
-       "      <td>27.519625</td>\n",
+       "      <th>163815</th>\n",
+       "      <td>27.519606</td>\n",
        "      <td>1.000000e+00</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
@@ -2145,36 +1738,36 @@
        "      <td>2</td>\n",
        "      <td>0.000073</td>\n",
        "      <td>0.000073</td>\n",
-       "      <td>652179.111493</td>\n",
-       "      <td>0.010580</td>\n",
+       "      <td>415263.133269</td>\n",
+       "      <td>0.016616</td>\n",
        "      <td>33 third st se</td>\n",
        "      <td>33 third st se</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000037</td>\n",
        "      <td>0.000037</td>\n",
-       "      <td>9450.378101</td>\n",
-       "      <td>0.317122</td>\n",
+       "      <td>9605.781694</td>\n",
+       "      <td>0.311992</td>\n",
        "      <td>sd</td>\n",
        "      <td>sd</td>\n",
        "      <td>1</td>\n",
        "      <td>0.001930</td>\n",
        "      <td>0.001930</td>\n",
-       "      <td>15.873789</td>\n",
-       "      <td>26.483035</td>\n",
+       "      <td>15.445559</td>\n",
+       "      <td>27.217182</td>\n",
        "      <td>huron</td>\n",
        "      <td>huron</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000073</td>\n",
        "      <td>0.000073</td>\n",
-       "      <td>108.031428</td>\n",
-       "      <td>86.293486</td>\n",
+       "      <td>102.014123</td>\n",
+       "      <td>91.382644</td>\n",
        "      <td>NR0WSTRN PBLK SRFS</td>\n",
        "      <td>NR0WSTRN PBLK SRFS</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>580681</th>\n",
-       "      <td>27.526533</td>\n",
+       "      <th>241593</th>\n",
+       "      <td>27.526514</td>\n",
        "      <td>1.000000e+00</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
@@ -2185,36 +1778,36 @@
        "      <td>2</td>\n",
        "      <td>0.000037</td>\n",
        "      <td>0.000037</td>\n",
-       "      <td>652179.111493</td>\n",
-       "      <td>0.021160</td>\n",
+       "      <td>415263.133269</td>\n",
+       "      <td>0.033231</td>\n",
        "      <td>163 acorn ln</td>\n",
        "      <td>163 acorn ln</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000037</td>\n",
        "      <td>0.000037</td>\n",
-       "      <td>9450.378101</td>\n",
-       "      <td>0.317122</td>\n",
+       "      <td>9605.781694</td>\n",
+       "      <td>0.311992</td>\n",
        "      <td>vt</td>\n",
        "      <td>vt</td>\n",
        "      <td>1</td>\n",
        "      <td>0.001537</td>\n",
        "      <td>0.001537</td>\n",
-       "      <td>15.873789</td>\n",
-       "      <td>33.262692</td>\n",
+       "      <td>15.445559</td>\n",
+       "      <td>34.184780</td>\n",
        "      <td>colchester</td>\n",
        "      <td>colchester</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000183</td>\n",
        "      <td>0.000183</td>\n",
-       "      <td>108.031428</td>\n",
-       "      <td>34.517394</td>\n",
+       "      <td>102.014123</td>\n",
+       "      <td>36.553058</td>\n",
        "      <td>KRN MNTN PWR</td>\n",
        "      <td>KRN MNTN PWR</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>438193</th>\n",
-       "      <td>27.757357</td>\n",
+       "      <th>165487</th>\n",
+       "      <td>27.757338</td>\n",
        "      <td>1.000000e+00</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
@@ -2225,36 +1818,36 @@
        "      <td>2</td>\n",
        "      <td>0.000024</td>\n",
        "      <td>0.000024</td>\n",
-       "      <td>652179.111493</td>\n",
-       "      <td>0.031739</td>\n",
+       "      <td>415263.133269</td>\n",
+       "      <td>0.049847</td>\n",
        "      <td>one clarks is</td>\n",
        "      <td>one clarks is</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000024</td>\n",
        "      <td>0.000024</td>\n",
-       "      <td>9450.378101</td>\n",
-       "      <td>0.475683</td>\n",
+       "      <td>9605.781694</td>\n",
+       "      <td>0.467987</td>\n",
        "      <td>wi</td>\n",
        "      <td>wi</td>\n",
        "      <td>1</td>\n",
        "      <td>0.008840</td>\n",
        "      <td>0.008840</td>\n",
-       "      <td>15.873789</td>\n",
-       "      <td>5.782805</td>\n",
+       "      <td>15.445559</td>\n",
+       "      <td>5.943112</td>\n",
        "      <td>wausau</td>\n",
        "      <td>wausau</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000061</td>\n",
        "      <td>0.000061</td>\n",
-       "      <td>108.031428</td>\n",
-       "      <td>103.552183</td>\n",
+       "      <td>102.014123</td>\n",
+       "      <td>109.659173</td>\n",
        "      <td>WS PPR MLS</td>\n",
        "      <td>WS PPR MLS</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>385934</th>\n",
-       "      <td>27.884385</td>\n",
+       "      <th>340414</th>\n",
+       "      <td>27.884365</td>\n",
        "      <td>1.000000e+00</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
@@ -2265,36 +1858,36 @@
        "      <td>2</td>\n",
        "      <td>0.000024</td>\n",
        "      <td>0.000024</td>\n",
-       "      <td>652179.111493</td>\n",
-       "      <td>0.031739</td>\n",
+       "      <td>415263.133269</td>\n",
+       "      <td>0.049847</td>\n",
        "      <td>520 francis st</td>\n",
        "      <td>520 francis st</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000024</td>\n",
        "      <td>0.000024</td>\n",
-       "      <td>9450.378101</td>\n",
-       "      <td>0.475683</td>\n",
+       "      <td>9605.781694</td>\n",
+       "      <td>0.467987</td>\n",
        "      <td>mo</td>\n",
        "      <td>mo</td>\n",
        "      <td>1</td>\n",
        "      <td>0.010118</td>\n",
        "      <td>0.010118</td>\n",
-       "      <td>15.873789</td>\n",
-       "      <td>5.052049</td>\n",
+       "      <td>15.445559</td>\n",
+       "      <td>5.192099</td>\n",
        "      <td>st joseph</td>\n",
        "      <td>st joseph</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000049</td>\n",
        "      <td>0.000049</td>\n",
-       "      <td>108.031428</td>\n",
-       "      <td>129.440229</td>\n",
+       "      <td>102.014123</td>\n",
+       "      <td>137.073967</td>\n",
        "      <td>ST JSF LT ANT PWR</td>\n",
        "      <td>ST JSF LT ANT PWR</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>503816</th>\n",
-       "      <td>29.211031</td>\n",
+       "      <th>274760</th>\n",
+       "      <td>29.211012</td>\n",
        "      <td>1.000000e+00</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
@@ -2305,29 +1898,29 @@
        "      <td>2</td>\n",
        "      <td>0.000037</td>\n",
        "      <td>0.000037</td>\n",
-       "      <td>652179.111493</td>\n",
-       "      <td>0.021160</td>\n",
+       "      <td>415263.133269</td>\n",
+       "      <td>0.033231</td>\n",
        "      <td>161 wellington rd</td>\n",
        "      <td>161 wellington rd</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000024</td>\n",
        "      <td>0.000024</td>\n",
-       "      <td>9450.378101</td>\n",
-       "      <td>0.475683</td>\n",
+       "      <td>9605.781694</td>\n",
+       "      <td>0.467987</td>\n",
        "      <td>vt</td>\n",
        "      <td>vt</td>\n",
        "      <td>1</td>\n",
        "      <td>0.001537</td>\n",
        "      <td>0.001537</td>\n",
-       "      <td>15.873789</td>\n",
-       "      <td>33.262692</td>\n",
+       "      <td>15.445559</td>\n",
+       "      <td>34.184780</td>\n",
        "      <td>brattleboro</td>\n",
        "      <td>brattleboro</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000086</td>\n",
        "      <td>0.000086</td>\n",
-       "      <td>108.031428</td>\n",
-       "      <td>73.965845</td>\n",
+       "      <td>102.014123</td>\n",
+       "      <td>78.327981</td>\n",
        "      <td>FBRMRK</td>\n",
        "      <td>FBRMRK</td>\n",
        "      <td>0</td>\n",
@@ -2338,23 +1931,23 @@
        "</div>"
       ],
       "text/plain": [
-       "        match_weight  match_probability         source_dataset_l         source_dataset_r  record_id_l  record_id_r      company_name_no_legal_l      company_name_no_legal_r  gamma_company_name_no_legal  tf_company_name_no_legal_l  tf_company_name_no_legal_r  bf_company_name_no_legal  bf_tf_adj_company_name_no_legal     street_address_l          street_address_r  gamma_street_address  tf_street_address_l  tf_street_address_r  bf_street_address  bf_tf_adj_street_address state_l state_r  gamma_state  tf_state_l  tf_state_r   bf_state  bf_tf_adj_state       city_l       city_r  gamma_city  tf_city_l  tf_city_r     bf_city  bf_tf_adj_city company_name_mphone_l  company_name_mphone_r match_key\n",
-       "295287    -22.970759       1.216501e-07  __splink__input_table_0  __splink__input_table_1         9829         3043              capitol bancorp                capital power                            0                    0.000024                    0.000012                  0.986045                         1.000000  capitol bancorp ctr       120010423 101 st nw                     0             0.000012             0.000110           0.881657                  1.000000      mi      ab            0    0.015147    0.000197   0.198711         1.000000      lansing     edmonton           0   0.000293   0.000428    0.296590        1.000000            KPTL BNKRP               KPTL PWR         0\n",
-       "383898    -22.970759       1.216501e-07  __splink__input_table_0  __splink__input_table_1        51783        17550                state bancorp  state street bank and trust                            0                    0.000024                    0.000024                  0.986045                         1.000000        2 jericho plz             100 summer st                     0             0.000012             0.000024           0.881657                  1.000000      ny      ma            0    0.120228    0.041765   0.198711         1.000000      jericho       boston           0   0.000306   0.014319    0.296590        1.000000             STT BNKRP  STT STRT BNK ANT TRST         0\n",
-       "383897    -22.970759       1.216501e-07  __splink__input_table_0  __splink__input_table_1        51782        17550         state auto financial  state street bank and trust                            0                    0.000024                    0.000024                  0.986045                         1.000000    518 east broad st             100 summer st                     0             0.000012             0.000024           0.881657                  1.000000      oh      ma            0    0.016991    0.041765   0.198711         1.000000     columbus       boston           0   0.002788   0.014319    0.296590        1.000000          STT AT FNNXL  STT STRT BNK ANT TRST         0\n",
-       "383896    -22.970759       1.216501e-07  __splink__input_table_0  __splink__input_table_1        51781        17550         state auto financial  state street bank and trust                            0                    0.000024                    0.000024                  0.986045                         1.000000       518 e broad st             100 summer st                     0             0.000012             0.000024           0.881657                  1.000000      oh      ma            0    0.016991    0.041765   0.198711         1.000000     columbus       boston           0   0.002788   0.014319    0.296590        1.000000          STT AT FNNXL  STT STRT BNK ANT TRST         0\n",
-       "383895    -22.970759       1.216501e-07  __splink__input_table_0  __splink__input_table_1        51780         3805                        starz                 citrus world                            0                    0.000024                    0.000049                  0.986045                         1.000000     8900 liberty cir  20205 hwy 2720205 hwy 27                     0             0.000024             0.000012           0.881657                  1.000000      co      fl            0    0.023802    0.048477   0.198711         1.000000    englewood   lake wales           0   0.002947   0.000049    0.296590        1.000000                  STRS              STRS WRLT         0\n",
-       "...              ...                ...                      ...                      ...          ...          ...                          ...                          ...                          ...                         ...                         ...                       ...                              ...                  ...                       ...                   ...                  ...                  ...                ...                       ...     ...     ...          ...         ...         ...        ...              ...          ...          ...         ...        ...        ...         ...             ...                   ...                    ...       ...\n",
-       "186872     27.519625       1.000000e+00  __splink__input_table_0  __splink__input_table_1        39816        13109  northwestern public service  northwestern public service                            2                    0.000073                    0.000073             652179.111493                         0.010580       33 third st se            33 third st se                     2             0.000037             0.000037        9450.378101                  0.317122      sd      sd            1    0.001930    0.001930  15.873789        26.483035        huron        huron           2   0.000073   0.000073  108.031428       86.293486    NR0WSTRN PBLK SRFS     NR0WSTRN PBLK SRFS         0\n",
-       "580681     27.526533       1.000000e+00  __splink__input_table_0  __splink__input_table_1        24650         8047         green mountain power         green mountain power                            2                    0.000037                    0.000037             652179.111493                         0.021160         163 acorn ln              163 acorn ln                     2             0.000037             0.000037        9450.378101                  0.317122      vt      vt            1    0.001537    0.001537  15.873789        33.262692   colchester   colchester           2   0.000183   0.000183  108.031428       34.517394          KRN MNTN PWR           KRN MNTN PWR         0\n",
-       "438193     27.757357       1.000000e+00  __splink__input_table_0  __splink__input_table_1        58842        19906           wausau paper mills           wausau paper mills                            2                    0.000024                    0.000024             652179.111493                         0.031739        one clarks is             one clarks is                     2             0.000024             0.000024        9450.378101                  0.475683      wi      wi            1    0.008840    0.008840  15.873789         5.782805       wausau       wausau           2   0.000061   0.000061  108.031428      103.552183            WS PPR MLS             WS PPR MLS         0\n",
-       "385934     27.884385       1.000000e+00  __splink__input_table_0  __splink__input_table_1        51567        17450    st joseph light and power    st joseph light and power                            2                    0.000024                    0.000024             652179.111493                         0.031739       520 francis st            520 francis st                     2             0.000024             0.000024        9450.378101                  0.475683      mo      mo            1    0.010118    0.010118  15.873789         5.052049    st joseph    st joseph           2   0.000049   0.000049  108.031428      129.440229     ST JSF LT ANT PWR      ST JSF LT ANT PWR         0\n",
-       "503816     29.211031       1.000000e+00  __splink__input_table_0  __splink__input_table_1        20588         6741                    fibermark                    fibermark                            2                    0.000037                    0.000037             652179.111493                         0.021160    161 wellington rd         161 wellington rd                     2             0.000024             0.000024        9450.378101                  0.475683      vt      vt            1    0.001537    0.001537  15.873789        33.262692  brattleboro  brattleboro           2   0.000086   0.000086  108.031428       73.965845                FBRMRK                 FBRMRK         0\n",
+       "        match_weight  match_probability         source_dataset_l         source_dataset_r  record_id_l  record_id_r          company_name_no_legal_l      company_name_no_legal_r  gamma_company_name_no_legal  tf_company_name_no_legal_l  tf_company_name_no_legal_r  bf_company_name_no_legal  bf_tf_adj_company_name_no_legal       street_address_l       street_address_r  gamma_street_address  tf_street_address_l  tf_street_address_r  bf_street_address  bf_tf_adj_street_address state_l state_r  gamma_state  tf_state_l  tf_state_r   bf_state  bf_tf_adj_state         city_l       city_r  gamma_city  tf_city_l  tf_city_r     bf_city  bf_tf_adj_city   company_name_mphone_l company_name_mphone_r match_key\n",
+       "295287    -22.967975       1.218850e-07  __splink__input_table_0  __splink__input_table_1        56230        19078                    union pacific               union electric                            0                    0.000049                    0.000098                  0.986046                         1.000000          1416 dodge st                mc 1400                     0             0.000049             0.000049           0.881658                  1.000000      ne      mo            0    0.006455    0.010118   0.199012         1.000000          omaha     st louis           0   0.003448   0.002764    0.296714        1.000000                UNN PSFK            UNN ELKTRK         0\n",
+       "384509    -22.967975       1.218850e-07  __splink__input_table_0  __splink__input_table_1        56484        19138  united states lime and minerals    united water conservation                            0                    0.000037                    0.000024                  0.986046                         1.000000           5429 lbj fwy  1701 north lombard st                     0             0.000024             0.000012           0.881658                  1.000000      tx      ca            0    0.079841    0.157960   0.199012         1.000000         dallas       oxnard           0   0.013855   0.000257    0.296714        1.000000  UNTT STTS LM ANT MNRLS      UNTT WTR KNSRFXN         0\n",
+       "384504    -22.967975       1.218850e-07  __splink__input_table_0  __splink__input_table_1        56436        19138                   united rentals    united water conservation                            0                    0.000024                    0.000024                  0.986046                         1.000000  100 first stamford pl  1701 north lombard st                     0             0.000122             0.000012           0.881658                  1.000000      ct      ca            0    0.020876    0.157960   0.199012         1.000000       stamford       oxnard           0   0.003950   0.000257    0.296714        1.000000              UNTT RNTLS      UNTT WTR KNSRFXN         0\n",
+       "384503    -22.967975       1.218850e-07  __splink__input_table_0  __splink__input_table_1        56424        19138            united parcel service    united water conservation                            0                    0.000024                    0.000024                  0.986046                         1.000000    55 glenlake pkwy ne  1701 north lombard st                     0             0.000012             0.000012           0.881658                  1.000000      ga      ca            0    0.018626    0.157960   0.199012         1.000000        atlanta       oxnard           0   0.008462   0.000257    0.296714        1.000000          UNTT PRSL SRFS      UNTT WTR KNSRFXN         0\n",
+       "384502    -22.967975       1.218850e-07  __splink__input_table_0  __splink__input_table_1        56312        19138              united bancorp /oh/    united water conservation                            0                    0.000024                    0.000024                  0.986046                         1.000000    201 south fourth st  1701 north lombard st                     0             0.000012             0.000012           0.881658                  1.000000      oh      ca            0    0.016991    0.157960   0.199012         1.000000  martins ferry       oxnard           0   0.000024   0.000257    0.296714        1.000000             UNTT BNKRP       UNTT WTR KNSRFXN         0\n",
+       "...              ...                ...                      ...                      ...          ...          ...                              ...                          ...                          ...                         ...                         ...                       ...                              ...                    ...                    ...                   ...                  ...                  ...                ...                       ...     ...     ...          ...         ...         ...        ...              ...            ...          ...         ...        ...        ...         ...             ...                     ...                   ...       ...\n",
+       "163815     27.519606       1.000000e+00  __splink__input_table_0  __splink__input_table_1        39816        13109      northwestern public service  northwestern public service                            2                    0.000073                    0.000073             415263.133269                         0.016616         33 third st se         33 third st se                     2             0.000037             0.000037        9605.781694                  0.311992      sd      sd            1    0.001930    0.001930  15.445559        27.217182          huron        huron           2   0.000073   0.000073  102.014123       91.382644      NR0WSTRN PBLK SRFS    NR0WSTRN PBLK SRFS         0\n",
+       "241593     27.526514       1.000000e+00  __splink__input_table_0  __splink__input_table_1        24650         8047             green mountain power         green mountain power                            2                    0.000037                    0.000037             415263.133269                         0.033231           163 acorn ln           163 acorn ln                     2             0.000037             0.000037        9605.781694                  0.311992      vt      vt            1    0.001537    0.001537  15.445559        34.184780     colchester   colchester           2   0.000183   0.000183  102.014123       36.553058            KRN MNTN PWR          KRN MNTN PWR         0\n",
+       "165487     27.757338       1.000000e+00  __splink__input_table_0  __splink__input_table_1        58842        19906               wausau paper mills           wausau paper mills                            2                    0.000024                    0.000024             415263.133269                         0.049847          one clarks is          one clarks is                     2             0.000024             0.000024        9605.781694                  0.467987      wi      wi            1    0.008840    0.008840  15.445559         5.943112         wausau       wausau           2   0.000061   0.000061  102.014123      109.659173              WS PPR MLS            WS PPR MLS         0\n",
+       "340414     27.884365       1.000000e+00  __splink__input_table_0  __splink__input_table_1        51567        17450        st joseph light and power    st joseph light and power                            2                    0.000024                    0.000024             415263.133269                         0.049847         520 francis st         520 francis st                     2             0.000024             0.000024        9605.781694                  0.467987      mo      mo            1    0.010118    0.010118  15.445559         5.192099      st joseph    st joseph           2   0.000049   0.000049  102.014123      137.073967       ST JSF LT ANT PWR     ST JSF LT ANT PWR         0\n",
+       "274760     29.211012       1.000000e+00  __splink__input_table_0  __splink__input_table_1        20588         6741                        fibermark                    fibermark                            2                    0.000037                    0.000037             415263.133269                         0.033231      161 wellington rd      161 wellington rd                     2             0.000024             0.000024        9605.781694                  0.467987      vt      vt            1    0.001537    0.001537  15.445559        34.184780    brattleboro  brattleboro           2   0.000086   0.000086  102.014123       78.327981                  FBRMRK                FBRMRK         0\n",
        "\n",
        "[590575 rows x 37 columns]"
       ]
      },
-     "execution_count": 135,
+     "execution_count": 42,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2365,7 +1958,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 136,
+   "execution_count": 43,
    "id": "c0b292c8-26ed-407a-866e-75851577d567",
    "metadata": {},
    "outputs": [],
@@ -2379,7 +1972,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 137,
+   "execution_count": 44,
    "id": "e8744d54-3cc6-434c-bbbe-a10d2d3cd9c0",
    "metadata": {},
    "outputs": [],
@@ -2392,7 +1985,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 138,
+   "execution_count": 45,
    "id": "5103190c-3775-427f-a8f2-cc8a8f79892b",
    "metadata": {},
    "outputs": [],
@@ -2404,7 +1997,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 139,
+   "execution_count": 46,
    "id": "8fa04f18-beff-4bdc-bbbb-705950eab5b8",
    "metadata": {},
    "outputs": [
@@ -2476,9 +2069,9 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>466134</th>\n",
-       "      <td>3.824596</td>\n",
-       "      <td>0.934073</td>\n",
+       "      <th>218797</th>\n",
+       "      <td>3.824578</td>\n",
+       "      <td>0.934072</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
        "      <td>14692</td>\n",
@@ -2488,29 +2081,29 @@
        "      <td>0</td>\n",
        "      <td>0.000012</td>\n",
        "      <td>0.000012</td>\n",
-       "      <td>0.986045</td>\n",
-       "      <td>1.000000</td>\n",
+       "      <td>0.986046</td>\n",
+       "      <td>1.0</td>\n",
        "      <td>100 first stamford pl</td>\n",
        "      <td>100 first stamford pl</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000122</td>\n",
        "      <td>0.000122</td>\n",
-       "      <td>9450.378101</td>\n",
-       "      <td>0.095137</td>\n",
+       "      <td>9605.781694</td>\n",
+       "      <td>0.093597</td>\n",
        "      <td>ct</td>\n",
        "      <td>ct</td>\n",
        "      <td>1</td>\n",
        "      <td>0.020876</td>\n",
        "      <td>0.020876</td>\n",
-       "      <td>15.873789</td>\n",
-       "      <td>2.448667</td>\n",
+       "      <td>15.445559</td>\n",
+       "      <td>2.516547</td>\n",
        "      <td>stamford</td>\n",
        "      <td>stamford</td>\n",
        "      <td>2</td>\n",
        "      <td>0.003950</td>\n",
        "      <td>0.003950</td>\n",
-       "      <td>108.031428</td>\n",
-       "      <td>1.602975</td>\n",
+       "      <td>102.014123</td>\n",
+       "      <td>1.697510</td>\n",
        "      <td>KRN</td>\n",
        "      <td>ENTRJ NKLR PWR MRKTNK</td>\n",
        "      <td>1</td>\n",
@@ -2522,8 +2115,8 @@
        "      <td>55243</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>466594</th>\n",
-       "      <td>4.620005</td>\n",
+       "      <th>220036</th>\n",
+       "      <td>4.619987</td>\n",
        "      <td>0.960922</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
@@ -2534,29 +2127,29 @@
        "      <td>0</td>\n",
        "      <td>0.000012</td>\n",
        "      <td>0.000012</td>\n",
-       "      <td>0.986045</td>\n",
-       "      <td>1.000000</td>\n",
+       "      <td>0.986046</td>\n",
+       "      <td>1.0</td>\n",
        "      <td>one energy plz</td>\n",
        "      <td>one energy plz</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000330</td>\n",
        "      <td>0.000330</td>\n",
-       "      <td>9450.378101</td>\n",
-       "      <td>0.035236</td>\n",
+       "      <td>9605.781694</td>\n",
+       "      <td>0.034666</td>\n",
        "      <td>mi</td>\n",
        "      <td>mi</td>\n",
        "      <td>1</td>\n",
        "      <td>0.015147</td>\n",
        "      <td>0.015147</td>\n",
-       "      <td>15.873789</td>\n",
-       "      <td>3.374867</td>\n",
+       "      <td>15.445559</td>\n",
+       "      <td>3.468423</td>\n",
        "      <td>detroit</td>\n",
        "      <td>detroit</td>\n",
        "      <td>2</td>\n",
        "      <td>0.001162</td>\n",
        "      <td>0.001162</td>\n",
-       "      <td>108.031428</td>\n",
-       "      <td>5.450115</td>\n",
+       "      <td>102.014123</td>\n",
+       "      <td>5.771535</td>\n",
        "      <td>TT ELKTRK SKRTSXN FNTNK I</td>\n",
        "      <td>TT SSTNBL JNRXN</td>\n",
        "      <td>1</td>\n",
@@ -2568,8 +2161,8 @@
        "      <td>64331</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>480747</th>\n",
-       "      <td>4.620005</td>\n",
+       "      <th>358152</th>\n",
+       "      <td>4.619987</td>\n",
        "      <td>0.960922</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
@@ -2580,29 +2173,29 @@
        "      <td>0</td>\n",
        "      <td>0.000012</td>\n",
        "      <td>0.000037</td>\n",
-       "      <td>0.986045</td>\n",
-       "      <td>1.000000</td>\n",
+       "      <td>0.986046</td>\n",
+       "      <td>1.0</td>\n",
        "      <td>one energy plz</td>\n",
        "      <td>one energy plz</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000330</td>\n",
        "      <td>0.000330</td>\n",
-       "      <td>9450.378101</td>\n",
-       "      <td>0.035236</td>\n",
+       "      <td>9605.781694</td>\n",
+       "      <td>0.034666</td>\n",
        "      <td>mi</td>\n",
        "      <td>mi</td>\n",
        "      <td>1</td>\n",
        "      <td>0.015147</td>\n",
        "      <td>0.015147</td>\n",
-       "      <td>15.873789</td>\n",
-       "      <td>3.374867</td>\n",
+       "      <td>15.445559</td>\n",
+       "      <td>3.468423</td>\n",
        "      <td>detroit</td>\n",
        "      <td>detroit</td>\n",
        "      <td>2</td>\n",
        "      <td>0.001162</td>\n",
        "      <td>0.001162</td>\n",
-       "      <td>108.031428</td>\n",
-       "      <td>5.450115</td>\n",
+       "      <td>102.014123</td>\n",
+       "      <td>5.771535</td>\n",
        "      <td>TT ELKTRK SKRTSXN FNTNK I</td>\n",
        "      <td>TT ELKTRK</td>\n",
        "      <td>0</td>\n",
@@ -2613,408 +2206,29 @@
        "      <td>5522</td>\n",
        "      <td>5109</td>\n",
        "    </tr>\n",
-       "    <tr>\n",
-       "      <th>464506</th>\n",
-       "      <td>6.019599</td>\n",
-       "      <td>0.984820</td>\n",
-       "      <td>__splink__input_table_0</td>\n",
-       "      <td>__splink__input_table_1</td>\n",
-       "      <td>14051</td>\n",
-       "      <td>10935</td>\n",
-       "      <td>constellation energy</td>\n",
-       "      <td>luminace solar rhode island</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.000024</td>\n",
-       "      <td>0.000024</td>\n",
-       "      <td>0.986045</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>1310 pt st</td>\n",
-       "      <td>1310 pt st</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000024</td>\n",
-       "      <td>0.000024</td>\n",
-       "      <td>9450.378101</td>\n",
-       "      <td>0.475683</td>\n",
-       "      <td>md</td>\n",
-       "      <td>md</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.025130</td>\n",
-       "      <td>0.025130</td>\n",
-       "      <td>15.873789</td>\n",
-       "      <td>2.034167</td>\n",
-       "      <td>baltimore</td>\n",
-       "      <td>baltimore</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.003583</td>\n",
-       "      <td>0.003583</td>\n",
-       "      <td>108.031428</td>\n",
-       "      <td>1.767102</td>\n",
-       "      <td>KNSTLXN ENRJ</td>\n",
-       "      <td>LMNS SLR RHT ISLNT</td>\n",
-       "      <td>1</td>\n",
-       "      <td>14051</td>\n",
-       "      <td>0001868275</td>\n",
-       "      <td>0001868275</td>\n",
-       "      <td>constellation energy corp</td>\n",
-       "      <td>10935</td>\n",
-       "      <td>62679</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>340973</th>\n",
-       "      <td>6.201744</td>\n",
-       "      <td>0.986596</td>\n",
-       "      <td>__splink__input_table_0</td>\n",
-       "      <td>__splink__input_table_1</td>\n",
-       "      <td>14051</td>\n",
-       "      <td>4420</td>\n",
-       "      <td>constellation energy</td>\n",
-       "      <td>constellation newenergy</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.000024</td>\n",
-       "      <td>0.000024</td>\n",
-       "      <td>5704.210475</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>1310 pt st</td>\n",
-       "      <td>100 constellation way</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.000024</td>\n",
-       "      <td>0.000183</td>\n",
-       "      <td>0.881657</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>md</td>\n",
-       "      <td>md</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.025130</td>\n",
-       "      <td>0.025130</td>\n",
-       "      <td>15.873789</td>\n",
-       "      <td>2.034167</td>\n",
-       "      <td>baltimore</td>\n",
-       "      <td>baltimore</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.003583</td>\n",
-       "      <td>0.003583</td>\n",
-       "      <td>108.031428</td>\n",
-       "      <td>1.767102</td>\n",
-       "      <td>KNSTLXN ENRJ</td>\n",
-       "      <td>KNSTLXN NWNRJ</td>\n",
-       "      <td>0</td>\n",
-       "      <td>14051</td>\n",
-       "      <td>0001868275</td>\n",
-       "      <td>0001868275</td>\n",
-       "      <td>constellation energy corp</td>\n",
-       "      <td>4420</td>\n",
-       "      <td>58491</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>464642</th>\n",
-       "      <td>5.308053</td>\n",
-       "      <td>0.975380</td>\n",
-       "      <td>__splink__input_table_0</td>\n",
-       "      <td>__splink__input_table_1</td>\n",
-       "      <td>1585</td>\n",
-       "      <td>6561</td>\n",
-       "      <td>air products and chemicals /de/</td>\n",
-       "      <td>exelon gen extexlaporte</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.000024</td>\n",
-       "      <td>0.000012</td>\n",
-       "      <td>0.986045</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>7201 hamilton blvd</td>\n",
-       "      <td>7201 hamilton blvd</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000122</td>\n",
-       "      <td>0.000122</td>\n",
-       "      <td>9450.378101</td>\n",
-       "      <td>0.095137</td>\n",
-       "      <td>pa</td>\n",
-       "      <td>pa</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.029409</td>\n",
-       "      <td>0.029409</td>\n",
-       "      <td>15.873789</td>\n",
-       "      <td>1.738226</td>\n",
-       "      <td>allentown</td>\n",
-       "      <td>allentown</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.001003</td>\n",
-       "      <td>0.001003</td>\n",
-       "      <td>108.031428</td>\n",
-       "      <td>6.314158</td>\n",
-       "      <td>AR PRTKTS ANT XMKLS T</td>\n",
-       "      <td>EKSLN JN EKSTKSLPRT</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1585</td>\n",
-       "      <td>0000002969</td>\n",
-       "      <td>0000002969</td>\n",
-       "      <td>air products &amp; chemicals inc /de/</td>\n",
-       "      <td>6561</td>\n",
-       "      <td>6081</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>227094</th>\n",
-       "      <td>20.402617</td>\n",
-       "      <td>0.999999</td>\n",
-       "      <td>__splink__input_table_0</td>\n",
-       "      <td>__splink__input_table_1</td>\n",
-       "      <td>1586</td>\n",
-       "      <td>430</td>\n",
-       "      <td>air products and chemicals</td>\n",
-       "      <td>air products and chemicals</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000037</td>\n",
-       "      <td>0.000037</td>\n",
-       "      <td>652179.111493</td>\n",
-       "      <td>0.021160</td>\n",
-       "      <td>1940 air products blvd</td>\n",
-       "      <td>1940 air products blvd</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000049</td>\n",
-       "      <td>0.000049</td>\n",
-       "      <td>9450.378101</td>\n",
-       "      <td>0.237842</td>\n",
-       "      <td>pa</td>\n",
-       "      <td>pa</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.029409</td>\n",
-       "      <td>0.029409</td>\n",
-       "      <td>15.873789</td>\n",
-       "      <td>1.738226</td>\n",
-       "      <td>allentown</td>\n",
-       "      <td>allentown</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.001003</td>\n",
-       "      <td>0.001003</td>\n",
-       "      <td>108.031428</td>\n",
-       "      <td>6.314158</td>\n",
-       "      <td>AR PRTKTS ANT XMKLS</td>\n",
-       "      <td>AR PRTKTS ANT XMKLS</td>\n",
-       "      <td>0</td>\n",
-       "      <td>1586</td>\n",
-       "      <td>0000002969</td>\n",
-       "      <td>0000002969</td>\n",
-       "      <td>air products &amp; chemicals, inc.</td>\n",
-       "      <td>430</td>\n",
-       "      <td>991</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>224504</th>\n",
-       "      <td>5.308053</td>\n",
-       "      <td>0.975380</td>\n",
-       "      <td>__splink__input_table_0</td>\n",
-       "      <td>__splink__input_table_1</td>\n",
-       "      <td>1585</td>\n",
-       "      <td>435</td>\n",
-       "      <td>air products and chemicals /de/</td>\n",
-       "      <td>air products</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.000024</td>\n",
-       "      <td>0.000037</td>\n",
-       "      <td>0.986045</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>7201 hamilton blvd</td>\n",
-       "      <td>7201 hamilton blvd</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000122</td>\n",
-       "      <td>0.000122</td>\n",
-       "      <td>9450.378101</td>\n",
-       "      <td>0.095137</td>\n",
-       "      <td>pa</td>\n",
-       "      <td>pa</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.029409</td>\n",
-       "      <td>0.029409</td>\n",
-       "      <td>15.873789</td>\n",
-       "      <td>1.738226</td>\n",
-       "      <td>allentown</td>\n",
-       "      <td>allentown</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.001003</td>\n",
-       "      <td>0.001003</td>\n",
-       "      <td>108.031428</td>\n",
-       "      <td>6.314158</td>\n",
-       "      <td>AR PRTKTS ANT XMKLS T</td>\n",
-       "      <td>AR PRTKTS</td>\n",
-       "      <td>0</td>\n",
-       "      <td>1585</td>\n",
-       "      <td>0000002969</td>\n",
-       "      <td>0000002969</td>\n",
-       "      <td>air products &amp; chemicals inc /de/</td>\n",
-       "      <td>435</td>\n",
-       "      <td>980</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>225982</th>\n",
-       "      <td>5.308053</td>\n",
-       "      <td>0.975380</td>\n",
-       "      <td>__splink__input_table_0</td>\n",
-       "      <td>__splink__input_table_1</td>\n",
-       "      <td>1585</td>\n",
-       "      <td>432</td>\n",
-       "      <td>air products and chemicals /de/</td>\n",
-       "      <td>air products energy enterprises</td>\n",
-       "      <td>0</td>\n",
-       "      <td>0.000024</td>\n",
-       "      <td>0.000012</td>\n",
-       "      <td>0.986045</td>\n",
-       "      <td>1.000000</td>\n",
-       "      <td>7201 hamilton blvd</td>\n",
-       "      <td>7201 hamilton blvd</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000122</td>\n",
-       "      <td>0.000122</td>\n",
-       "      <td>9450.378101</td>\n",
-       "      <td>0.095137</td>\n",
-       "      <td>pa</td>\n",
-       "      <td>pa</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.029409</td>\n",
-       "      <td>0.029409</td>\n",
-       "      <td>15.873789</td>\n",
-       "      <td>1.738226</td>\n",
-       "      <td>allentown</td>\n",
-       "      <td>allentown</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.001003</td>\n",
-       "      <td>0.001003</td>\n",
-       "      <td>108.031428</td>\n",
-       "      <td>6.314158</td>\n",
-       "      <td>AR PRTKTS ANT XMKLS T</td>\n",
-       "      <td>AR PRTKTS ENRJ ENTRPRSS</td>\n",
-       "      <td>0</td>\n",
-       "      <td>1585</td>\n",
-       "      <td>0000002969</td>\n",
-       "      <td>0000002969</td>\n",
-       "      <td>air products &amp; chemicals inc /de/</td>\n",
-       "      <td>432</td>\n",
-       "      <td>353</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>224473</th>\n",
-       "      <td>20.054878</td>\n",
-       "      <td>0.999999</td>\n",
-       "      <td>__splink__input_table_0</td>\n",
-       "      <td>__splink__input_table_1</td>\n",
-       "      <td>1348</td>\n",
-       "      <td>376</td>\n",
-       "      <td>aetna life and casualty</td>\n",
-       "      <td>aetna life and casualty</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000024</td>\n",
-       "      <td>0.000024</td>\n",
-       "      <td>652179.111493</td>\n",
-       "      <td>0.031739</td>\n",
-       "      <td>151 farmington ave</td>\n",
-       "      <td>151 farmington ave</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000110</td>\n",
-       "      <td>0.000110</td>\n",
-       "      <td>9450.378101</td>\n",
-       "      <td>0.105707</td>\n",
-       "      <td>ct</td>\n",
-       "      <td>ct</td>\n",
-       "      <td>1</td>\n",
-       "      <td>0.020876</td>\n",
-       "      <td>0.020876</td>\n",
-       "      <td>15.873789</td>\n",
-       "      <td>2.448667</td>\n",
-       "      <td>hartford</td>\n",
-       "      <td>hartford</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.001198</td>\n",
-       "      <td>0.001198</td>\n",
-       "      <td>108.031428</td>\n",
-       "      <td>5.283275</td>\n",
-       "      <td>ETN LF ANT KSLT</td>\n",
-       "      <td>ETN LF ANT KSLT</td>\n",
-       "      <td>0</td>\n",
-       "      <td>1348</td>\n",
-       "      <td>0000002648</td>\n",
-       "      <td>0000002648</td>\n",
-       "      <td>aetna life &amp; casualty co</td>\n",
-       "      <td>376</td>\n",
-       "      <td>211</td>\n",
-       "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
-       "<p>2085 rows × 43 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
-       "        match_weight  match_probability         source_dataset_l         source_dataset_r  record_id_l  record_id_r                company_name_no_legal_l          company_name_no_legal_r  gamma_company_name_no_legal  tf_company_name_no_legal_l  tf_company_name_no_legal_r  bf_company_name_no_legal  bf_tf_adj_company_name_no_legal        street_address_l        street_address_r  gamma_street_address  tf_street_address_l  tf_street_address_r  bf_street_address  bf_tf_adj_street_address state_l state_r  gamma_state  tf_state_l  tf_state_r   bf_state  bf_tf_adj_state     city_l     city_r  gamma_city  tf_city_l  tf_city_r     bf_city  bf_tf_adj_city      company_name_mphone_l    company_name_mphone_r match_key  record_id_x sec_company_id central_index_key                           company_name_raw  record_id_y  utility_id_eia\n",
-       "466134      3.824596           0.934073  __splink__input_table_0  __splink__input_table_1        14692         6293                                  crane  entergy nuclear power marketing                            0                    0.000012                    0.000012                  0.986045                         1.000000   100 first stamford pl   100 first stamford pl                     2             0.000122             0.000122        9450.378101                  0.095137      ct      ct            1    0.020876    0.020876  15.873789         2.448667   stamford   stamford           2   0.003950   0.003950  108.031428        1.602975                        KRN    ENTRJ NKLR PWR MRKTNK         1        14692     0001944013        0001944013                                   crane co         6293           55243\n",
-       "466594      4.620005           0.960922  __splink__input_table_0  __splink__input_table_1        17752         5535  dte electric securitization funding i       dte sustainable generation                            0                    0.000012                    0.000012                  0.986045                         1.000000          one energy plz          one energy plz                     2             0.000330             0.000330        9450.378101                  0.035236      mi      mi            1    0.015147    0.015147  15.873789         3.374867    detroit    detroit           2   0.001162   0.001162  108.031428        5.450115  TT ELKTRK SKRTSXN FNTNK I          TT SSTNBL JNRXN         1        17752     0001876068        0001876068  dte electric securitization funding i llc         5535           64331\n",
-       "480747      4.620005           0.960922  __splink__input_table_0  __splink__input_table_1        17752         5522  dte electric securitization funding i                     dte electric                            0                    0.000012                    0.000037                  0.986045                         1.000000          one energy plz          one energy plz                     2             0.000330             0.000330        9450.378101                  0.035236      mi      mi            1    0.015147    0.015147  15.873789         3.374867    detroit    detroit           2   0.001162   0.001162  108.031428        5.450115  TT ELKTRK SKRTSXN FNTNK I                TT ELKTRK         0        17752     0001876068        0001876068  dte electric securitization funding i llc         5522            5109\n",
-       "464506      6.019599           0.984820  __splink__input_table_0  __splink__input_table_1        14051        10935                   constellation energy      luminace solar rhode island                            0                    0.000024                    0.000024                  0.986045                         1.000000              1310 pt st              1310 pt st                     2             0.000024             0.000024        9450.378101                  0.475683      md      md            1    0.025130    0.025130  15.873789         2.034167  baltimore  baltimore           2   0.003583   0.003583  108.031428        1.767102               KNSTLXN ENRJ       LMNS SLR RHT ISLNT         1        14051     0001868275        0001868275                  constellation energy corp        10935           62679\n",
-       "340973      6.201744           0.986596  __splink__input_table_0  __splink__input_table_1        14051         4420                   constellation energy          constellation newenergy                            1                    0.000024                    0.000024               5704.210475                         1.000000              1310 pt st   100 constellation way                     0             0.000024             0.000183           0.881657                  1.000000      md      md            1    0.025130    0.025130  15.873789         2.034167  baltimore  baltimore           2   0.003583   0.003583  108.031428        1.767102               KNSTLXN ENRJ            KNSTLXN NWNRJ         0        14051     0001868275        0001868275                  constellation energy corp         4420           58491\n",
-       "...              ...                ...                      ...                      ...          ...          ...                                    ...                              ...                          ...                         ...                         ...                       ...                              ...                     ...                     ...                   ...                  ...                  ...                ...                       ...     ...     ...          ...         ...         ...        ...              ...        ...        ...         ...        ...        ...         ...             ...                        ...                      ...       ...          ...            ...               ...                                        ...          ...             ...\n",
-       "464642      5.308053           0.975380  __splink__input_table_0  __splink__input_table_1         1585         6561        air products and chemicals /de/          exelon gen extexlaporte                            0                    0.000024                    0.000012                  0.986045                         1.000000      7201 hamilton blvd      7201 hamilton blvd                     2             0.000122             0.000122        9450.378101                  0.095137      pa      pa            1    0.029409    0.029409  15.873789         1.738226  allentown  allentown           2   0.001003   0.001003  108.031428        6.314158      AR PRTKTS ANT XMKLS T      EKSLN JN EKSTKSLPRT         1         1585     0000002969        0000002969          air products & chemicals inc /de/         6561            6081\n",
-       "227094     20.402617           0.999999  __splink__input_table_0  __splink__input_table_1         1586          430             air products and chemicals       air products and chemicals                            2                    0.000037                    0.000037             652179.111493                         0.021160  1940 air products blvd  1940 air products blvd                     2             0.000049             0.000049        9450.378101                  0.237842      pa      pa            1    0.029409    0.029409  15.873789         1.738226  allentown  allentown           2   0.001003   0.001003  108.031428        6.314158        AR PRTKTS ANT XMKLS      AR PRTKTS ANT XMKLS         0         1586     0000002969        0000002969             air products & chemicals, inc.          430             991\n",
-       "224504      5.308053           0.975380  __splink__input_table_0  __splink__input_table_1         1585          435        air products and chemicals /de/                     air products                            0                    0.000024                    0.000037                  0.986045                         1.000000      7201 hamilton blvd      7201 hamilton blvd                     2             0.000122             0.000122        9450.378101                  0.095137      pa      pa            1    0.029409    0.029409  15.873789         1.738226  allentown  allentown           2   0.001003   0.001003  108.031428        6.314158      AR PRTKTS ANT XMKLS T                AR PRTKTS         0         1585     0000002969        0000002969          air products & chemicals inc /de/          435             980\n",
-       "225982      5.308053           0.975380  __splink__input_table_0  __splink__input_table_1         1585          432        air products and chemicals /de/  air products energy enterprises                            0                    0.000024                    0.000012                  0.986045                         1.000000      7201 hamilton blvd      7201 hamilton blvd                     2             0.000122             0.000122        9450.378101                  0.095137      pa      pa            1    0.029409    0.029409  15.873789         1.738226  allentown  allentown           2   0.001003   0.001003  108.031428        6.314158      AR PRTKTS ANT XMKLS T  AR PRTKTS ENRJ ENTRPRSS         0         1585     0000002969        0000002969          air products & chemicals inc /de/          432             353\n",
-       "224473     20.054878           0.999999  __splink__input_table_0  __splink__input_table_1         1348          376                aetna life and casualty          aetna life and casualty                            2                    0.000024                    0.000024             652179.111493                         0.031739      151 farmington ave      151 farmington ave                     2             0.000110             0.000110        9450.378101                  0.105707      ct      ct            1    0.020876    0.020876  15.873789         2.448667   hartford   hartford           2   0.001198   0.001198  108.031428        5.283275            ETN LF ANT KSLT          ETN LF ANT KSLT         0         1348     0000002648        0000002648                   aetna life & casualty co          376             211\n",
-       "\n",
-       "[2085 rows x 43 columns]"
+       "        match_weight  match_probability         source_dataset_l         source_dataset_r  record_id_l  record_id_r                company_name_no_legal_l          company_name_no_legal_r  gamma_company_name_no_legal  tf_company_name_no_legal_l  tf_company_name_no_legal_r  bf_company_name_no_legal  bf_tf_adj_company_name_no_legal       street_address_l       street_address_r  gamma_street_address  tf_street_address_l  tf_street_address_r  bf_street_address  bf_tf_adj_street_address state_l state_r  gamma_state  tf_state_l  tf_state_r   bf_state  bf_tf_adj_state    city_l    city_r  gamma_city  tf_city_l  tf_city_r     bf_city  bf_tf_adj_city      company_name_mphone_l  company_name_mphone_r match_key  record_id_x sec_company_id central_index_key                           company_name_raw  record_id_y  utility_id_eia\n",
+       "218797      3.824578           0.934072  __splink__input_table_0  __splink__input_table_1        14692         6293                                  crane  entergy nuclear power marketing                            0                    0.000012                    0.000012                  0.986046                              1.0  100 first stamford pl  100 first stamford pl                     2             0.000122             0.000122        9605.781694                  0.093597      ct      ct            1    0.020876    0.020876  15.445559         2.516547  stamford  stamford           2   0.003950   0.003950  102.014123        1.697510                        KRN  ENTRJ NKLR PWR MRKTNK         1        14692     0001944013        0001944013                                   crane co         6293           55243\n",
+       "220036      4.619987           0.960922  __splink__input_table_0  __splink__input_table_1        17752         5535  dte electric securitization funding i       dte sustainable generation                            0                    0.000012                    0.000012                  0.986046                              1.0         one energy plz         one energy plz                     2             0.000330             0.000330        9605.781694                  0.034666      mi      mi            1    0.015147    0.015147  15.445559         3.468423   detroit   detroit           2   0.001162   0.001162  102.014123        5.771535  TT ELKTRK SKRTSXN FNTNK I        TT SSTNBL JNRXN         1        17752     0001876068        0001876068  dte electric securitization funding i llc         5535           64331\n",
+       "358152      4.619987           0.960922  __splink__input_table_0  __splink__input_table_1        17752         5522  dte electric securitization funding i                     dte electric                            0                    0.000012                    0.000037                  0.986046                              1.0         one energy plz         one energy plz                     2             0.000330             0.000330        9605.781694                  0.034666      mi      mi            1    0.015147    0.015147  15.445559         3.468423   detroit   detroit           2   0.001162   0.001162  102.014123        5.771535  TT ELKTRK SKRTSXN FNTNK I              TT ELKTRK         0        17752     0001876068        0001876068  dte electric securitization funding i llc         5522            5109"
       ]
      },
-     "execution_count": 139,
+     "execution_count": 46,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "preds_validation_df[preds_validation_df.match_probability > .9]"
+    "preds_validation_df[preds_validation_df.match_probability > .9].head(3)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 140,
+   "execution_count": 47,
    "id": "11190456-12a9-49df-b863-7a6f674e39eb",
    "metadata": {},
    "outputs": [],
@@ -3024,7 +2238,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 141,
+   "execution_count": 48,
    "id": "5a57bae5-6188-4a6c-be9a-14a159f82d81",
    "metadata": {},
    "outputs": [],
@@ -3034,7 +2248,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 142,
+   "execution_count": 49,
    "id": "461725d0-28c8-43dd-bcd5-086b7d3f7d4b",
    "metadata": {},
    "outputs": [],
@@ -3049,7 +2263,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 143,
+   "execution_count": 50,
    "id": "4d45f339-7a5b-466a-81f5-c71e425a77df",
    "metadata": {},
    "outputs": [],
@@ -3059,7 +2273,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 144,
+   "execution_count": 51,
    "id": "182732ea-39c5-4fb4-9429-5f7aed781ef5",
    "metadata": {},
    "outputs": [],
@@ -3072,7 +2286,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 145,
+   "execution_count": 52,
    "id": "bc058dbf-6f71-4978-90ce-5405edbbf9e5",
    "metadata": {},
    "outputs": [
@@ -3162,7 +2376,7 @@
        "      <td>1</td>\n",
        "      <td>13310.0</td>\n",
        "      <td>4281.0</td>\n",
-       "      <td>0.999981</td>\n",
+       "      <td>0.999982</td>\n",
        "      <td>1.0</td>\n",
        "      <td>both</td>\n",
        "      <td>1.0</td>\n",
@@ -3176,7 +2390,7 @@
        "      <td>1</td>\n",
        "      <td>17793.0</td>\n",
        "      <td>5564.0</td>\n",
-       "      <td>0.927294</td>\n",
+       "      <td>0.927293</td>\n",
        "      <td>2.0</td>\n",
        "      <td>both</td>\n",
        "      <td>0.0</td>\n",
@@ -3216,7 +2430,7 @@
        "      <td>southern co</td>\n",
        "      <td>southern co services inc</td>\n",
        "      <td>0</td>\n",
-       "      <td>50962.0</td>\n",
+       "      <td>50964.0</td>\n",
        "      <td>17068.0</td>\n",
        "      <td>0.007216</td>\n",
        "      <td>0.0</td>\n",
@@ -3274,7 +2488,7 @@
        "      <td>0</td>\n",
        "      <td>21579.0</td>\n",
        "      <td>6780.0</td>\n",
-       "      <td>0.986543</td>\n",
+       "      <td>0.986542</td>\n",
        "      <td>0.0</td>\n",
        "      <td>both</td>\n",
        "      <td>1.0</td>\n",
@@ -3288,7 +2502,7 @@
        "      <td>0</td>\n",
        "      <td>21579.0</td>\n",
        "      <td>6763.0</td>\n",
-       "      <td>0.085467</td>\n",
+       "      <td>0.085466</td>\n",
        "      <td>0.0</td>\n",
        "      <td>both</td>\n",
        "      <td>0.0</td>\n",
@@ -3358,7 +2572,7 @@
        "      <td>0</td>\n",
        "      <td>40084.0</td>\n",
        "      <td>13240.0</td>\n",
-       "      <td>0.300167</td>\n",
+       "      <td>0.300165</td>\n",
        "      <td>0.0</td>\n",
        "      <td>both</td>\n",
        "      <td>0.0</td>\n",
@@ -3372,7 +2586,7 @@
        "      <td>1</td>\n",
        "      <td>40084.0</td>\n",
        "      <td>13243.0</td>\n",
-       "      <td>0.999820</td>\n",
+       "      <td>0.999813</td>\n",
        "      <td>2.0</td>\n",
        "      <td>both</td>\n",
        "      <td>1.0</td>\n",
@@ -3414,7 +2628,7 @@
        "      <td>1</td>\n",
        "      <td>49303.0</td>\n",
        "      <td>16270.0</td>\n",
-       "      <td>0.559074</td>\n",
+       "      <td>0.559071</td>\n",
        "      <td>0.0</td>\n",
        "      <td>both</td>\n",
        "      <td>0.0</td>\n",
@@ -3426,7 +2640,7 @@
        "      <td>american electric power co inc</td>\n",
        "      <td>American Electric Power Inc</td>\n",
        "      <td>1</td>\n",
-       "      <td>2926.0</td>\n",
+       "      <td>2927.0</td>\n",
        "      <td>793.0</td>\n",
        "      <td>0.996076</td>\n",
        "      <td>2.0</td>\n",
@@ -3456,30 +2670,30 @@
        "0         0000003153             195                 alabama power co                                    NaN      1       1701.0        478.0           1.000000                          2.0       both              1.0\n",
        "1         0001868941           58702             fluence energy, inc.                                Fluence      0      21792.0       6889.0           0.016529                          0.0       both              0.0\n",
        "2         0000041091            7140                 georgia power co                                    NaN      1      23416.0       7653.0           0.999997                          2.0       both              1.0\n",
-       "3         0000022198            4062  columbus southern power co /oh/             Columbus Southern Power Co      1      13310.0       4281.0           0.999981                          1.0       both              1.0\n",
-       "4         0001326160            5416                 duke energy corp                                    NaN      1      17793.0       5564.0           0.927294                          2.0       both              0.0\n",
+       "3         0000022198            4062  columbus southern power co /oh/             Columbus Southern Power Co      1      13310.0       4281.0           0.999982                          1.0       both              1.0\n",
+       "4         0001326160            5416                 duke energy corp                                    NaN      1      17793.0       5564.0           0.927293                          2.0       both              0.0\n",
        "5         0000030371           54905       duke energy carolinas, llc              Duke Energy Carolinas LLC      1      17790.0       5558.0           0.999987                          2.0       both              1.0\n",
        "6         0000869446           57140      berkshire realty co inc /de  Berkshire Wind Power Cooperative Corp      0       7449.0       1712.0           0.001912                          0.0       both              0.0\n",
-       "7         0000092122           18195                      southern co               southern co services inc      0      50962.0      17068.0           0.007216                          0.0       both              0.0\n",
+       "7         0000092122           18195                      southern co               southern co services inc      0      50964.0      17068.0           0.007216                          0.0       both              0.0\n",
        "8         0000092122           17650                      southern co                      Southern Power Co      0      50963.0      17089.0           0.034232                          0.0       both              0.0\n",
        "9         0000075488           14328        pacific gas & electric co                                    NaN      1      41598.0      13933.0           0.999948                          2.0       both              1.0\n",
        "10        0001031296            6526                 firstenergy corp                            FirstEnergy      0      21579.0       6776.0           0.999998                          2.0       both              1.0\n",
-       "11        0001031296           54776                 firstenergy corp    FirstEnergy Nuclear Generation Corp      0      21579.0       6780.0           0.986543                          0.0       both              1.0\n",
-       "12        0001031296            6458                 firstenergy corp                  First Energy Services      0      21579.0       6763.0           0.085467                          0.0       both              0.0\n",
+       "11        0001031296           54776                 firstenergy corp    FirstEnergy Nuclear Generation Corp      0      21579.0       6780.0           0.986542                          0.0       both              1.0\n",
+       "12        0001031296            6458                 firstenergy corp                  First Energy Services      0      21579.0       6763.0           0.085466                          0.0       both              0.0\n",
        "13        0001031296           32208                 firstenergy corp                      First Energy Corp      1          NaN          NaN                NaN                          NaN  left_only              0.0\n",
        "14        0000100122           24211         tucson electric power co                                    NaN      1      55725.0      18901.0           1.000000                          2.0       both              1.0\n",
        "15        0000096271           18454                tampa electric co                                    NaN      1      53604.0      18180.0           0.991059                          2.0       both              1.0\n",
        "16        0000715957            5248             dominion energy, inc                                    NaN      1      17484.0       5386.0           0.999985                          2.0       both              1.0\n",
-       "17        0001013871           59883                  nrg energy, inc     NRG Energy Gas & Wind Holdings Inc      0      40084.0      13240.0           0.300167                          0.0       both              0.0\n",
-       "18        0001013871           13377                   nrg energy inc                         NRG Energy Inc      1      40084.0      13243.0           0.999820                          2.0       both              1.0\n",
+       "17        0001013871           59883                  nrg energy, inc     NRG Energy Gas & Wind Holdings Inc      0      40084.0      13240.0           0.300165                          0.0       both              0.0\n",
+       "18        0001013871           13377                   nrg energy inc                         NRG Energy Inc      1      40084.0      13243.0           0.999813                          2.0       both              1.0\n",
        "19        0000788816           13994            oglethorpe power corp                                    NaN      1      40576.0      13515.0           1.000000                          2.0       both              1.0\n",
        "20        0000018675            3266           central maine power co                                    NaN      1      10876.0       3424.0           1.000000                          2.0       both              1.0\n",
-       "21        0001032208           61296                    sempra energy                      Sempra Generation      1      49303.0      16270.0           0.559074                          0.0       both              0.0\n",
-       "22        0000004904             488   american electric power co inc            American Electric Power Inc      1       2926.0        793.0           0.996076                          2.0       both              1.0\n",
+       "21        0001032208           61296                    sempra energy                      Sempra Generation      1      49303.0      16270.0           0.559071                          0.0       both              0.0\n",
+       "22        0000004904             488   american electric power co inc            American Electric Power Inc      1       2927.0        793.0           0.996076                          2.0       both              1.0\n",
        "23        0000715957            5248             dominion energy, inc                   Dominion Energy Inc.      1      17484.0       5386.0           0.999985                          2.0       both              1.0"
       ]
      },
-     "execution_count": 145,
+     "execution_count": 52,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -3490,7 +2704,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 146,
+   "execution_count": 53,
    "id": "2fc6ed05-5b35-4856-a668-f16e253e7fea",
    "metadata": {},
    "outputs": [],
@@ -3506,7 +2720,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 147,
+   "execution_count": 54,
    "id": "99a7bc64-9f32-4f53-9a89-63a31ff7debe",
    "metadata": {},
    "outputs": [
@@ -3516,7 +2730,7 @@
        "(np.float64(0.8666666666666667), np.float64(0.8125), 0.7916666666666666)"
       ]
      },
-     "execution_count": 147,
+     "execution_count": 54,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -3527,7 +2741,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 148,
+   "execution_count": 55,
    "id": "08932be5-b90c-440d-9efb-156cb4d63c93",
    "metadata": {},
    "outputs": [
@@ -3577,7 +2791,7 @@
        "Positive                   3                  13"
       ]
      },
-     "execution_count": 148,
+     "execution_count": 55,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -3592,7 +2806,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 149,
+   "execution_count": 56,
    "id": "025c80e9-5055-4eaa-a873-38b910cd7f94",
    "metadata": {},
    "outputs": [],
@@ -3602,7 +2816,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 150,
+   "execution_count": 57,
    "id": "e4f44fda-1b55-4f9c-a96d-5eec36dac768",
    "metadata": {},
    "outputs": [
@@ -3650,7 +2864,7 @@
        "      <td>1</td>\n",
        "      <td>17793.0</td>\n",
        "      <td>5564.0</td>\n",
-       "      <td>0.927294</td>\n",
+       "      <td>0.927293</td>\n",
        "      <td>2.0</td>\n",
        "      <td>both</td>\n",
        "      <td>0.0</td>\n",
@@ -3678,7 +2892,7 @@
        "      <td>0</td>\n",
        "      <td>21579.0</td>\n",
        "      <td>6780.0</td>\n",
-       "      <td>0.986543</td>\n",
+       "      <td>0.986542</td>\n",
        "      <td>0.0</td>\n",
        "      <td>both</td>\n",
        "      <td>1.0</td>\n",
@@ -3706,7 +2920,7 @@
        "      <td>1</td>\n",
        "      <td>49303.0</td>\n",
        "      <td>16270.0</td>\n",
-       "      <td>0.559074</td>\n",
+       "      <td>0.559071</td>\n",
        "      <td>0.0</td>\n",
        "      <td>both</td>\n",
        "      <td>0.0</td>\n",
@@ -3717,14 +2931,14 @@
       ],
       "text/plain": [
        "   central_index_key  utility_id_eia  sec_company_name                     eia_company_name  match  record_id_l  record_id_r  match_probability  gamma_company_name_no_legal     _merge  predicted_match\n",
-       "4         0001326160            5416  duke energy corp                                  NaN      1      17793.0       5564.0           0.927294                          2.0       both              0.0\n",
+       "4         0001326160            5416  duke energy corp                                  NaN      1      17793.0       5564.0           0.927293                          2.0       both              0.0\n",
        "10        0001031296            6526  firstenergy corp                          FirstEnergy      0      21579.0       6776.0           0.999998                          2.0       both              1.0\n",
-       "11        0001031296           54776  firstenergy corp  FirstEnergy Nuclear Generation Corp      0      21579.0       6780.0           0.986543                          0.0       both              1.0\n",
+       "11        0001031296           54776  firstenergy corp  FirstEnergy Nuclear Generation Corp      0      21579.0       6780.0           0.986542                          0.0       both              1.0\n",
        "13        0001031296           32208  firstenergy corp                    First Energy Corp      1          NaN          NaN                NaN                          NaN  left_only              0.0\n",
-       "21        0001032208           61296     sempra energy                    Sempra Generation      1      49303.0      16270.0           0.559074                          0.0       both              0.0"
+       "21        0001032208           61296     sempra energy                    Sempra Generation      1      49303.0      16270.0           0.559071                          0.0       both              0.0"
       ]
      },
-     "execution_count": 150,
+     "execution_count": 57,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -3851,7 +3065,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 153,
+   "execution_count": 58,
    "id": "92172e2f-39ba-49e3-8312-98597256ca4f",
    "metadata": {},
    "outputs": [],
@@ -3867,17 +3081,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 154,
+   "execution_count": 59,
    "id": "07ca81ae-1b26-4cd3-ade6-75381028028a",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "525"
+       "534"
       ]
      },
-     "execution_count": 154,
+     "execution_count": 59,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -3885,6 +3099,672 @@
    "source": [
     "len(one_to_one_preds)"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c3db3175-7cf3-497c-8f22-e68a6c9c6af2",
+   "metadata": {},
+   "source": [
+    "# Add `utility_id_eia` onto the SEC table to create output table"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "id": "361b3e30-e823-4137-9062-6a00eae537fe",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>match_weight</th>\n",
+       "      <th>match_probability</th>\n",
+       "      <th>source_dataset_l</th>\n",
+       "      <th>source_dataset_r</th>\n",
+       "      <th>record_id_l</th>\n",
+       "      <th>record_id_r</th>\n",
+       "      <th>company_name_no_legal_l</th>\n",
+       "      <th>company_name_no_legal_r</th>\n",
+       "      <th>gamma_company_name_no_legal</th>\n",
+       "      <th>tf_company_name_no_legal_l</th>\n",
+       "      <th>tf_company_name_no_legal_r</th>\n",
+       "      <th>bf_company_name_no_legal</th>\n",
+       "      <th>bf_tf_adj_company_name_no_legal</th>\n",
+       "      <th>street_address_l</th>\n",
+       "      <th>street_address_r</th>\n",
+       "      <th>gamma_street_address</th>\n",
+       "      <th>tf_street_address_l</th>\n",
+       "      <th>tf_street_address_r</th>\n",
+       "      <th>bf_street_address</th>\n",
+       "      <th>bf_tf_adj_street_address</th>\n",
+       "      <th>state_l</th>\n",
+       "      <th>state_r</th>\n",
+       "      <th>gamma_state</th>\n",
+       "      <th>tf_state_l</th>\n",
+       "      <th>tf_state_r</th>\n",
+       "      <th>bf_state</th>\n",
+       "      <th>bf_tf_adj_state</th>\n",
+       "      <th>city_l</th>\n",
+       "      <th>city_r</th>\n",
+       "      <th>gamma_city</th>\n",
+       "      <th>tf_city_l</th>\n",
+       "      <th>tf_city_r</th>\n",
+       "      <th>bf_city</th>\n",
+       "      <th>bf_tf_adj_city</th>\n",
+       "      <th>company_name_mphone_l</th>\n",
+       "      <th>company_name_mphone_r</th>\n",
+       "      <th>match_key</th>\n",
+       "      <th>record_id_x</th>\n",
+       "      <th>sec_company_id</th>\n",
+       "      <th>central_index_key</th>\n",
+       "      <th>company_name_raw</th>\n",
+       "      <th>record_id_y</th>\n",
+       "      <th>utility_id_eia</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>274760</th>\n",
+       "      <td>29.211012</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>20588</td>\n",
+       "      <td>6741</td>\n",
+       "      <td>fibermark</td>\n",
+       "      <td>fibermark</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000037</td>\n",
+       "      <td>0.000037</td>\n",
+       "      <td>415263.133269</td>\n",
+       "      <td>0.033231</td>\n",
+       "      <td>161 wellington rd</td>\n",
+       "      <td>161 wellington rd</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>9605.781694</td>\n",
+       "      <td>0.467987</td>\n",
+       "      <td>vt</td>\n",
+       "      <td>vt</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.001537</td>\n",
+       "      <td>0.001537</td>\n",
+       "      <td>15.445559</td>\n",
+       "      <td>34.184780</td>\n",
+       "      <td>brattleboro</td>\n",
+       "      <td>brattleboro</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000086</td>\n",
+       "      <td>0.000086</td>\n",
+       "      <td>102.014123</td>\n",
+       "      <td>78.327981</td>\n",
+       "      <td>FBRMRK</td>\n",
+       "      <td>FBRMRK</td>\n",
+       "      <td>0</td>\n",
+       "      <td>20588</td>\n",
+       "      <td>0000887591</td>\n",
+       "      <td>0000887591</td>\n",
+       "      <td>fibermark inc</td>\n",
+       "      <td>6741</td>\n",
+       "      <td>6309</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>340414</th>\n",
+       "      <td>27.884365</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>51567</td>\n",
+       "      <td>17450</td>\n",
+       "      <td>st joseph light and power</td>\n",
+       "      <td>st joseph light and power</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>415263.133269</td>\n",
+       "      <td>0.049847</td>\n",
+       "      <td>520 francis st</td>\n",
+       "      <td>520 francis st</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>9605.781694</td>\n",
+       "      <td>0.467987</td>\n",
+       "      <td>mo</td>\n",
+       "      <td>mo</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.010118</td>\n",
+       "      <td>0.010118</td>\n",
+       "      <td>15.445559</td>\n",
+       "      <td>5.192099</td>\n",
+       "      <td>st joseph</td>\n",
+       "      <td>st joseph</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000049</td>\n",
+       "      <td>0.000049</td>\n",
+       "      <td>102.014123</td>\n",
+       "      <td>137.073967</td>\n",
+       "      <td>ST JSF LT ANT PWR</td>\n",
+       "      <td>ST JSF LT ANT PWR</td>\n",
+       "      <td>0</td>\n",
+       "      <td>51567</td>\n",
+       "      <td>0000086251</td>\n",
+       "      <td>0000086251</td>\n",
+       "      <td>st joseph light &amp; power co</td>\n",
+       "      <td>17450</td>\n",
+       "      <td>17881</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>165487</th>\n",
+       "      <td>27.757338</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>58842</td>\n",
+       "      <td>19906</td>\n",
+       "      <td>wausau paper mills</td>\n",
+       "      <td>wausau paper mills</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>415263.133269</td>\n",
+       "      <td>0.049847</td>\n",
+       "      <td>one clarks is</td>\n",
+       "      <td>one clarks is</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>9605.781694</td>\n",
+       "      <td>0.467987</td>\n",
+       "      <td>wi</td>\n",
+       "      <td>wi</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.008840</td>\n",
+       "      <td>0.008840</td>\n",
+       "      <td>15.445559</td>\n",
+       "      <td>5.943112</td>\n",
+       "      <td>wausau</td>\n",
+       "      <td>wausau</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000061</td>\n",
+       "      <td>0.000061</td>\n",
+       "      <td>102.014123</td>\n",
+       "      <td>109.659173</td>\n",
+       "      <td>WS PPR MLS</td>\n",
+       "      <td>WS PPR MLS</td>\n",
+       "      <td>0</td>\n",
+       "      <td>58842</td>\n",
+       "      <td>0000105076</td>\n",
+       "      <td>0000105076</td>\n",
+       "      <td>wausau paper mills co</td>\n",
+       "      <td>19906</td>\n",
+       "      <td>20190</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>241593</th>\n",
+       "      <td>27.526514</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>24650</td>\n",
+       "      <td>8047</td>\n",
+       "      <td>green mountain power</td>\n",
+       "      <td>green mountain power</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000037</td>\n",
+       "      <td>0.000037</td>\n",
+       "      <td>415263.133269</td>\n",
+       "      <td>0.033231</td>\n",
+       "      <td>163 acorn ln</td>\n",
+       "      <td>163 acorn ln</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000037</td>\n",
+       "      <td>0.000037</td>\n",
+       "      <td>9605.781694</td>\n",
+       "      <td>0.311992</td>\n",
+       "      <td>vt</td>\n",
+       "      <td>vt</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.001537</td>\n",
+       "      <td>0.001537</td>\n",
+       "      <td>15.445559</td>\n",
+       "      <td>34.184780</td>\n",
+       "      <td>colchester</td>\n",
+       "      <td>colchester</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000183</td>\n",
+       "      <td>0.000183</td>\n",
+       "      <td>102.014123</td>\n",
+       "      <td>36.553058</td>\n",
+       "      <td>KRN MNTN PWR</td>\n",
+       "      <td>KRN MNTN PWR</td>\n",
+       "      <td>0</td>\n",
+       "      <td>24650</td>\n",
+       "      <td>0000043704</td>\n",
+       "      <td>0000043704</td>\n",
+       "      <td>green mountain power corp</td>\n",
+       "      <td>8047</td>\n",
+       "      <td>7601</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>163815</th>\n",
+       "      <td>27.519606</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>39816</td>\n",
+       "      <td>13109</td>\n",
+       "      <td>northwestern public service</td>\n",
+       "      <td>northwestern public service</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000073</td>\n",
+       "      <td>0.000073</td>\n",
+       "      <td>415263.133269</td>\n",
+       "      <td>0.016616</td>\n",
+       "      <td>33 third st se</td>\n",
+       "      <td>33 third st se</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000037</td>\n",
+       "      <td>0.000037</td>\n",
+       "      <td>9605.781694</td>\n",
+       "      <td>0.311992</td>\n",
+       "      <td>sd</td>\n",
+       "      <td>sd</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.001930</td>\n",
+       "      <td>0.001930</td>\n",
+       "      <td>15.445559</td>\n",
+       "      <td>27.217182</td>\n",
+       "      <td>huron</td>\n",
+       "      <td>huron</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000073</td>\n",
+       "      <td>0.000073</td>\n",
+       "      <td>102.014123</td>\n",
+       "      <td>91.382644</td>\n",
+       "      <td>NR0WSTRN PBLK SRFS</td>\n",
+       "      <td>NR0WSTRN PBLK SRFS</td>\n",
+       "      <td>0</td>\n",
+       "      <td>39816</td>\n",
+       "      <td>0000073088</td>\n",
+       "      <td>0000073088</td>\n",
+       "      <td>northwestern public service co</td>\n",
+       "      <td>13109</td>\n",
+       "      <td>13809</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1483</th>\n",
+       "      <td>4.337121</td>\n",
+       "      <td>0.952856</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>58004</td>\n",
+       "      <td>17611</td>\n",
+       "      <td>vistacare</td>\n",
+       "      <td>stirling energy systems solar three</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000037</td>\n",
+       "      <td>0.986046</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>4800 n scottsdale rd</td>\n",
+       "      <td>4800 n scottsdale rd</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000110</td>\n",
+       "      <td>0.000110</td>\n",
+       "      <td>9605.781694</td>\n",
+       "      <td>0.103997</td>\n",
+       "      <td>az</td>\n",
+       "      <td>az</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.012872</td>\n",
+       "      <td>0.012872</td>\n",
+       "      <td>15.445559</td>\n",
+       "      <td>4.081277</td>\n",
+       "      <td>scottsdale</td>\n",
+       "      <td>scottsdale</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.004989</td>\n",
+       "      <td>0.004989</td>\n",
+       "      <td>102.014123</td>\n",
+       "      <td>1.343862</td>\n",
+       "      <td>FSTKR</td>\n",
+       "      <td>STRLNK ENRJ SSTMS SLR 0R</td>\n",
+       "      <td>1</td>\n",
+       "      <td>58004</td>\n",
+       "      <td>0000787030</td>\n",
+       "      <td>0000787030</td>\n",
+       "      <td>vistacare, inc.</td>\n",
+       "      <td>17611</td>\n",
+       "      <td>56168</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>218453</th>\n",
+       "      <td>4.272157</td>\n",
+       "      <td>0.950792</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>19174</td>\n",
+       "      <td>7605</td>\n",
+       "      <td>enovis</td>\n",
+       "      <td>genon sabine delaware</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000012</td>\n",
+       "      <td>0.000012</td>\n",
+       "      <td>0.986046</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>2711 centerville rd</td>\n",
+       "      <td>2711 centerville rd</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000061</td>\n",
+       "      <td>0.000061</td>\n",
+       "      <td>9605.781694</td>\n",
+       "      <td>0.187195</td>\n",
+       "      <td>de</td>\n",
+       "      <td>de</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.011717</td>\n",
+       "      <td>0.011717</td>\n",
+       "      <td>15.445559</td>\n",
+       "      <td>4.483838</td>\n",
+       "      <td>wilmington</td>\n",
+       "      <td>wilmington</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.010321</td>\n",
+       "      <td>0.010321</td>\n",
+       "      <td>102.014123</td>\n",
+       "      <td>0.649640</td>\n",
+       "      <td>ENFS</td>\n",
+       "      <td>JNN SBN TLWR</td>\n",
+       "      <td>1</td>\n",
+       "      <td>19174</td>\n",
+       "      <td>0001420800</td>\n",
+       "      <td>0001420800</td>\n",
+       "      <td>enovis corp</td>\n",
+       "      <td>7605</td>\n",
+       "      <td>56922</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1055</th>\n",
+       "      <td>4.272157</td>\n",
+       "      <td>0.950792</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>1650</td>\n",
+       "      <td>16368</td>\n",
+       "      <td>aisystems</td>\n",
+       "      <td>shannon wind</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.986046</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>2711 centerville rd</td>\n",
+       "      <td>2711 centerville rd</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000061</td>\n",
+       "      <td>0.000061</td>\n",
+       "      <td>9605.781694</td>\n",
+       "      <td>0.187195</td>\n",
+       "      <td>de</td>\n",
+       "      <td>de</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.011717</td>\n",
+       "      <td>0.011717</td>\n",
+       "      <td>15.445559</td>\n",
+       "      <td>4.483838</td>\n",
+       "      <td>wilmington</td>\n",
+       "      <td>wilmington</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.010321</td>\n",
+       "      <td>0.010321</td>\n",
+       "      <td>102.014123</td>\n",
+       "      <td>0.649640</td>\n",
+       "      <td>ASSTMS</td>\n",
+       "      <td>XNN WNT</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1650</td>\n",
+       "      <td>0001328769</td>\n",
+       "      <td>0001328769</td>\n",
+       "      <td>aisystems, inc.</td>\n",
+       "      <td>16368</td>\n",
+       "      <td>58872</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7216</th>\n",
+       "      <td>4.272157</td>\n",
+       "      <td>0.950792</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>32403</td>\n",
+       "      <td>14089</td>\n",
+       "      <td>lease investment flight trust</td>\n",
+       "      <td>pasadena statutory trust</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000012</td>\n",
+       "      <td>0.000012</td>\n",
+       "      <td>0.986046</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1100 north market st</td>\n",
+       "      <td>1100 north market st</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000061</td>\n",
+       "      <td>0.000061</td>\n",
+       "      <td>9605.781694</td>\n",
+       "      <td>0.187195</td>\n",
+       "      <td>de</td>\n",
+       "      <td>de</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.011717</td>\n",
+       "      <td>0.011717</td>\n",
+       "      <td>15.445559</td>\n",
+       "      <td>4.483838</td>\n",
+       "      <td>wilmington</td>\n",
+       "      <td>wilmington</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.010321</td>\n",
+       "      <td>0.010321</td>\n",
+       "      <td>102.014123</td>\n",
+       "      <td>0.649640</td>\n",
+       "      <td>LS INFSTMNT FLT TRST</td>\n",
+       "      <td>PSTN STTTR TRST</td>\n",
+       "      <td>1</td>\n",
+       "      <td>32403</td>\n",
+       "      <td>0001158389</td>\n",
+       "      <td>0001158389</td>\n",
+       "      <td>lease investment flight trust</td>\n",
+       "      <td>14089</td>\n",
+       "      <td>61235</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6113</th>\n",
+       "      <td>4.272157</td>\n",
+       "      <td>0.950792</td>\n",
+       "      <td>__splink__input_table_0</td>\n",
+       "      <td>__splink__input_table_1</td>\n",
+       "      <td>1626</td>\n",
+       "      <td>16195</td>\n",
+       "      <td>airplanes us trust</td>\n",
+       "      <td>se solar trust v c</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.000012</td>\n",
+       "      <td>0.000012</td>\n",
+       "      <td>0.986046</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>1100 north market st</td>\n",
+       "      <td>1100 north market st</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000061</td>\n",
+       "      <td>0.000061</td>\n",
+       "      <td>9605.781694</td>\n",
+       "      <td>0.187195</td>\n",
+       "      <td>de</td>\n",
+       "      <td>de</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.011717</td>\n",
+       "      <td>0.011717</td>\n",
+       "      <td>15.445559</td>\n",
+       "      <td>4.483838</td>\n",
+       "      <td>wilmington</td>\n",
+       "      <td>wilmington</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.010321</td>\n",
+       "      <td>0.010321</td>\n",
+       "      <td>102.014123</td>\n",
+       "      <td>0.649640</td>\n",
+       "      <td>ARPLNS US TRST</td>\n",
+       "      <td>S SLR TRST F K</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1626</td>\n",
+       "      <td>0001004540</td>\n",
+       "      <td>0001004540</td>\n",
+       "      <td>airplanes us trust</td>\n",
+       "      <td>16195</td>\n",
+       "      <td>56900</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>534 rows × 43 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "        match_weight  match_probability         source_dataset_l         source_dataset_r  record_id_l  record_id_r        company_name_no_legal_l              company_name_no_legal_r  gamma_company_name_no_legal  tf_company_name_no_legal_l  tf_company_name_no_legal_r  bf_company_name_no_legal  bf_tf_adj_company_name_no_legal      street_address_l      street_address_r  gamma_street_address  tf_street_address_l  tf_street_address_r  bf_street_address  bf_tf_adj_street_address state_l state_r  gamma_state  tf_state_l  tf_state_r   bf_state  bf_tf_adj_state       city_l       city_r  gamma_city  tf_city_l  tf_city_r     bf_city  bf_tf_adj_city company_name_mphone_l     company_name_mphone_r match_key  record_id_x sec_company_id central_index_key                company_name_raw  record_id_y  utility_id_eia\n",
+       "274760     29.211012           1.000000  __splink__input_table_0  __splink__input_table_1        20588         6741                      fibermark                            fibermark                            2                    0.000037                    0.000037             415263.133269                         0.033231     161 wellington rd     161 wellington rd                     2             0.000024             0.000024        9605.781694                  0.467987      vt      vt            1    0.001537    0.001537  15.445559        34.184780  brattleboro  brattleboro           2   0.000086   0.000086  102.014123       78.327981                FBRMRK                    FBRMRK         0        20588     0000887591        0000887591                   fibermark inc         6741            6309\n",
+       "340414     27.884365           1.000000  __splink__input_table_0  __splink__input_table_1        51567        17450      st joseph light and power            st joseph light and power                            2                    0.000024                    0.000024             415263.133269                         0.049847        520 francis st        520 francis st                     2             0.000024             0.000024        9605.781694                  0.467987      mo      mo            1    0.010118    0.010118  15.445559         5.192099    st joseph    st joseph           2   0.000049   0.000049  102.014123      137.073967     ST JSF LT ANT PWR         ST JSF LT ANT PWR         0        51567     0000086251        0000086251      st joseph light & power co        17450           17881\n",
+       "165487     27.757338           1.000000  __splink__input_table_0  __splink__input_table_1        58842        19906             wausau paper mills                   wausau paper mills                            2                    0.000024                    0.000024             415263.133269                         0.049847         one clarks is         one clarks is                     2             0.000024             0.000024        9605.781694                  0.467987      wi      wi            1    0.008840    0.008840  15.445559         5.943112       wausau       wausau           2   0.000061   0.000061  102.014123      109.659173            WS PPR MLS                WS PPR MLS         0        58842     0000105076        0000105076           wausau paper mills co        19906           20190\n",
+       "241593     27.526514           1.000000  __splink__input_table_0  __splink__input_table_1        24650         8047           green mountain power                 green mountain power                            2                    0.000037                    0.000037             415263.133269                         0.033231          163 acorn ln          163 acorn ln                     2             0.000037             0.000037        9605.781694                  0.311992      vt      vt            1    0.001537    0.001537  15.445559        34.184780   colchester   colchester           2   0.000183   0.000183  102.014123       36.553058          KRN MNTN PWR              KRN MNTN PWR         0        24650     0000043704        0000043704       green mountain power corp         8047            7601\n",
+       "163815     27.519606           1.000000  __splink__input_table_0  __splink__input_table_1        39816        13109    northwestern public service          northwestern public service                            2                    0.000073                    0.000073             415263.133269                         0.016616        33 third st se        33 third st se                     2             0.000037             0.000037        9605.781694                  0.311992      sd      sd            1    0.001930    0.001930  15.445559        27.217182        huron        huron           2   0.000073   0.000073  102.014123       91.382644    NR0WSTRN PBLK SRFS        NR0WSTRN PBLK SRFS         0        39816     0000073088        0000073088  northwestern public service co        13109           13809\n",
+       "...              ...                ...                      ...                      ...          ...          ...                            ...                                  ...                          ...                         ...                         ...                       ...                              ...                   ...                   ...                   ...                  ...                  ...                ...                       ...     ...     ...          ...         ...         ...        ...              ...          ...          ...         ...        ...        ...         ...             ...                   ...                       ...       ...          ...            ...               ...                             ...          ...             ...\n",
+       "1483        4.337121           0.952856  __splink__input_table_0  __splink__input_table_1        58004        17611                      vistacare  stirling energy systems solar three                            0                    0.000024                    0.000037                  0.986046                         1.000000  4800 n scottsdale rd  4800 n scottsdale rd                     2             0.000110             0.000110        9605.781694                  0.103997      az      az            1    0.012872    0.012872  15.445559         4.081277   scottsdale   scottsdale           2   0.004989   0.004989  102.014123        1.343862                 FSTKR  STRLNK ENRJ SSTMS SLR 0R         1        58004     0000787030        0000787030                 vistacare, inc.        17611           56168\n",
+       "218453      4.272157           0.950792  __splink__input_table_0  __splink__input_table_1        19174         7605                         enovis                genon sabine delaware                            0                    0.000012                    0.000012                  0.986046                         1.000000   2711 centerville rd   2711 centerville rd                     2             0.000061             0.000061        9605.781694                  0.187195      de      de            1    0.011717    0.011717  15.445559         4.483838   wilmington   wilmington           2   0.010321   0.010321  102.014123        0.649640                  ENFS              JNN SBN TLWR         1        19174     0001420800        0001420800                     enovis corp         7605           56922\n",
+       "1055        4.272157           0.950792  __splink__input_table_0  __splink__input_table_1         1650        16368                      aisystems                         shannon wind                            0                    0.000024                    0.000024                  0.986046                         1.000000   2711 centerville rd   2711 centerville rd                     2             0.000061             0.000061        9605.781694                  0.187195      de      de            1    0.011717    0.011717  15.445559         4.483838   wilmington   wilmington           2   0.010321   0.010321  102.014123        0.649640                ASSTMS                   XNN WNT         1         1650     0001328769        0001328769                 aisystems, inc.        16368           58872\n",
+       "7216        4.272157           0.950792  __splink__input_table_0  __splink__input_table_1        32403        14089  lease investment flight trust             pasadena statutory trust                            0                    0.000012                    0.000012                  0.986046                         1.000000  1100 north market st  1100 north market st                     2             0.000061             0.000061        9605.781694                  0.187195      de      de            1    0.011717    0.011717  15.445559         4.483838   wilmington   wilmington           2   0.010321   0.010321  102.014123        0.649640  LS INFSTMNT FLT TRST           PSTN STTTR TRST         1        32403     0001158389        0001158389   lease investment flight trust        14089           61235\n",
+       "6113        4.272157           0.950792  __splink__input_table_0  __splink__input_table_1         1626        16195             airplanes us trust                   se solar trust v c                            0                    0.000012                    0.000012                  0.986046                         1.000000  1100 north market st  1100 north market st                     2             0.000061             0.000061        9605.781694                  0.187195      de      de            1    0.011717    0.011717  15.445559         4.483838   wilmington   wilmington           2   0.010321   0.010321  102.014123        0.649640        ARPLNS US TRST            S SLR TRST F K         1         1626     0001004540        0001004540              airplanes us trust        16195           56900\n",
+       "\n",
+       "[534 rows x 43 columns]"
+      ]
+     },
+     "execution_count": 60,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "one_to_one_preds"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "id": "1d3e41bd-f92a-4f77-a0a7-0bd24f7ea70c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "out_df = sec_df.merge(\n",
+    "    one_to_one_preds[[\"sec_company_id\", \"utility_id_eia\"]],\n",
+    "    how=\"left\",\n",
+    "    on=\"sec_company_id\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "id": "cce2b383-48b3-4efd-977a-0c734b0e3ec2",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "utility_id_eia\n",
+       "True     59895\n",
+       "False     1131\n",
+       "Name: count, dtype: int64"
+      ]
+     },
+     "execution_count": 66,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "out_df.utility_id_eia.isnull().value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1cf0be2e-b1ef-4eb1-a07a-28e977c40252",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(one_to_one_preds)"
+   ]
   }
  ],
  "metadata": {
diff --git a/notebooks/20-kl-validate-sec-output-table.ipynb b/notebooks/20-kl-validate-sec-output-table.ipynb
index d6045f9..061a227 100644
--- a/notebooks/20-kl-validate-sec-output-table.ipynb
+++ b/notebooks/20-kl-validate-sec-output-table.ipynb
@@ -50,7 +50,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 3,
    "id": "c1795acc-8005-4b6d-be4d-27c722b634f1",
    "metadata": {},
    "outputs": [],
@@ -58,6 +58,284 @@
     "ex21_df = pd.read_pickle(\"/Users/katielamb/CatalystCoop/dagster_home/storage/transformed_ex21_subsidiary_table\")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "291ce873-4971-4e03-985a-65dbdd8b0850",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>sec_company_id</th>\n",
+       "      <th>company_name_raw</th>\n",
+       "      <th>location_of_inc</th>\n",
+       "      <th>own_per</th>\n",
+       "      <th>filename</th>\n",
+       "      <th>report_date</th>\n",
+       "      <th>report_year</th>\n",
+       "      <th>company_name</th>\n",
+       "      <th>company_name_no_legal</th>\n",
+       "      <th>company_name_mphone</th>\n",
+       "      <th>parent_company_cik</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0000000020_colormax limited_united kingdom</td>\n",
+       "      <td>colormax limited</td>\n",
+       "      <td>united kingdom</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>edgar/data/20/0000893220-06-000650.txt</td>\n",
+       "      <td>2006-03-23</td>\n",
+       "      <td>2006</td>\n",
+       "      <td>colormax limited</td>\n",
+       "      <td>colormax</td>\n",
+       "      <td>KLRMKS</td>\n",
+       "      <td>0000000020</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0000000020_gundlach equipment corporation_dela...</td>\n",
+       "      <td>gundlach equipment corporation</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>edgar/data/20/0000950123-10-024631.txt</td>\n",
+       "      <td>2010-03-15</td>\n",
+       "      <td>2010</td>\n",
+       "      <td>gundlach equipment corporation</td>\n",
+       "      <td>gundlach equipment</td>\n",
+       "      <td>KNTLX EKPMNT</td>\n",
+       "      <td>0000000020</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0000000020_jeffrey rader ab_sweden</td>\n",
+       "      <td>jeffrey rader ab</td>\n",
+       "      <td>sweden</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>edgar/data/20/0000950123-10-024631.txt</td>\n",
+       "      <td>2010-03-15</td>\n",
+       "      <td>2010</td>\n",
+       "      <td>jeffrey rader ab</td>\n",
+       "      <td>jeffrey rader ab</td>\n",
+       "      <td>JFR RTR AB</td>\n",
+       "      <td>0000000020</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>0000000020_jeffrey rader canada company_canada</td>\n",
+       "      <td>jeffrey rader canada company</td>\n",
+       "      <td>canada</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>edgar/data/20/0000950123-10-024631.txt</td>\n",
+       "      <td>2010-03-15</td>\n",
+       "      <td>2010</td>\n",
+       "      <td>jeffrey rader canada company</td>\n",
+       "      <td>jeffrey rader canada</td>\n",
+       "      <td>JFR RTR KNT</td>\n",
+       "      <td>0000000020</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0000000020_jeffrey rader corporation_delaware</td>\n",
+       "      <td>jeffrey rader corporation</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>edgar/data/20/0000950123-10-024631.txt</td>\n",
+       "      <td>2010-03-15</td>\n",
+       "      <td>2010</td>\n",
+       "      <td>jeffrey rader corporation</td>\n",
+       "      <td>jeffrey rader</td>\n",
+       "      <td>JFR RTR</td>\n",
+       "      <td>0000000020</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1055982</th>\n",
+       "      <td>0001967649_vestis supply chain limited liabili...</td>\n",
+       "      <td>vestis (supply chain), llc</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>edgar/data/1967649/0001967649-23-000025.txt</td>\n",
+       "      <td>2023-12-21</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>vestis supply chain limited liability company</td>\n",
+       "      <td>vestis supply chain</td>\n",
+       "      <td>FSTS SPL XN</td>\n",
+       "      <td>0001967649</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1055983</th>\n",
+       "      <td>0001967649_vestis syracuse limited liability c...</td>\n",
+       "      <td>vestis (syracuse), llc</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>edgar/data/1967649/0001967649-23-000025.txt</td>\n",
+       "      <td>2023-12-21</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>vestis syracuse limited liability company</td>\n",
+       "      <td>vestis syracuse</td>\n",
+       "      <td>FSTS SRKS</td>\n",
+       "      <td>0001967649</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1055984</th>\n",
+       "      <td>0001967649_vestis texas limited liability comp...</td>\n",
+       "      <td>vestis (texas), llc</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>edgar/data/1967649/0001967649-23-000025.txt</td>\n",
+       "      <td>2023-12-21</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>vestis texas limited liability company</td>\n",
+       "      <td>vestis texas</td>\n",
+       "      <td>FSTS TKSS</td>\n",
+       "      <td>0001967649</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1055985</th>\n",
+       "      <td>0001967649_vestis west adams limited liability...</td>\n",
+       "      <td>vestis (west adams), llc</td>\n",
+       "      <td>delaware</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>edgar/data/1967649/0001967649-23-000025.txt</td>\n",
+       "      <td>2023-12-21</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>vestis west adams limited liability company</td>\n",
+       "      <td>vestis west adams</td>\n",
+       "      <td>FSTS WST ATMS</td>\n",
+       "      <td>0001967649</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1055986</th>\n",
+       "      <td>0001978811_gouverneur savings and loan associa...</td>\n",
+       "      <td>gouverneur savings and loan association</td>\n",
+       "      <td>new york</td>\n",
+       "      <td>100.0</td>\n",
+       "      <td>edgar/data/1978811/0001558370-23-020009.txt</td>\n",
+       "      <td>2023-12-26</td>\n",
+       "      <td>2023</td>\n",
+       "      <td>gouverneur savings and loan association</td>\n",
+       "      <td>gouverneur savings and loan</td>\n",
+       "      <td>KFRNR SFNKS ANT LN</td>\n",
+       "      <td>0001978811</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>1055987 rows × 11 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                            sec_company_id  \\\n",
+       "0               0000000020_colormax limited_united kingdom   \n",
+       "1        0000000020_gundlach equipment corporation_dela...   \n",
+       "2                       0000000020_jeffrey rader ab_sweden   \n",
+       "3           0000000020_jeffrey rader canada company_canada   \n",
+       "4            0000000020_jeffrey rader corporation_delaware   \n",
+       "...                                                    ...   \n",
+       "1055982  0001967649_vestis supply chain limited liabili...   \n",
+       "1055983  0001967649_vestis syracuse limited liability c...   \n",
+       "1055984  0001967649_vestis texas limited liability comp...   \n",
+       "1055985  0001967649_vestis west adams limited liability...   \n",
+       "1055986  0001978811_gouverneur savings and loan associa...   \n",
+       "\n",
+       "                                company_name_raw location_of_inc own_per  \\\n",
+       "0                               colormax limited  united kingdom     NaN   \n",
+       "1                 gundlach equipment corporation        delaware     NaN   \n",
+       "2                               jeffrey rader ab          sweden     NaN   \n",
+       "3                   jeffrey rader canada company          canada     NaN   \n",
+       "4                      jeffrey rader corporation        delaware     NaN   \n",
+       "...                                          ...             ...     ...   \n",
+       "1055982               vestis (supply chain), llc        delaware     NaN   \n",
+       "1055983                   vestis (syracuse), llc        delaware     NaN   \n",
+       "1055984                      vestis (texas), llc        delaware     NaN   \n",
+       "1055985                 vestis (west adams), llc        delaware     NaN   \n",
+       "1055986  gouverneur savings and loan association        new york   100.0   \n",
+       "\n",
+       "                                            filename report_date  report_year  \\\n",
+       "0             edgar/data/20/0000893220-06-000650.txt  2006-03-23         2006   \n",
+       "1             edgar/data/20/0000950123-10-024631.txt  2010-03-15         2010   \n",
+       "2             edgar/data/20/0000950123-10-024631.txt  2010-03-15         2010   \n",
+       "3             edgar/data/20/0000950123-10-024631.txt  2010-03-15         2010   \n",
+       "4             edgar/data/20/0000950123-10-024631.txt  2010-03-15         2010   \n",
+       "...                                              ...         ...          ...   \n",
+       "1055982  edgar/data/1967649/0001967649-23-000025.txt  2023-12-21         2023   \n",
+       "1055983  edgar/data/1967649/0001967649-23-000025.txt  2023-12-21         2023   \n",
+       "1055984  edgar/data/1967649/0001967649-23-000025.txt  2023-12-21         2023   \n",
+       "1055985  edgar/data/1967649/0001967649-23-000025.txt  2023-12-21         2023   \n",
+       "1055986  edgar/data/1978811/0001558370-23-020009.txt  2023-12-26         2023   \n",
+       "\n",
+       "                                          company_name  \\\n",
+       "0                                     colormax limited   \n",
+       "1                       gundlach equipment corporation   \n",
+       "2                                     jeffrey rader ab   \n",
+       "3                         jeffrey rader canada company   \n",
+       "4                            jeffrey rader corporation   \n",
+       "...                                                ...   \n",
+       "1055982  vestis supply chain limited liability company   \n",
+       "1055983      vestis syracuse limited liability company   \n",
+       "1055984         vestis texas limited liability company   \n",
+       "1055985    vestis west adams limited liability company   \n",
+       "1055986        gouverneur savings and loan association   \n",
+       "\n",
+       "               company_name_no_legal company_name_mphone parent_company_cik  \n",
+       "0                           colormax              KLRMKS         0000000020  \n",
+       "1                 gundlach equipment        KNTLX EKPMNT         0000000020  \n",
+       "2                   jeffrey rader ab          JFR RTR AB         0000000020  \n",
+       "3               jeffrey rader canada         JFR RTR KNT         0000000020  \n",
+       "4                      jeffrey rader             JFR RTR         0000000020  \n",
+       "...                              ...                 ...                ...  \n",
+       "1055982          vestis supply chain         FSTS SPL XN         0001967649  \n",
+       "1055983              vestis syracuse           FSTS SRKS         0001967649  \n",
+       "1055984                 vestis texas           FSTS TKSS         0001967649  \n",
+       "1055985            vestis west adams       FSTS WST ATMS         0001967649  \n",
+       "1055986  gouverneur savings and loan  KFRNR SFNKS ANT LN         0001978811  \n",
+       "\n",
+       "[1055987 rows x 11 columns]"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "ex21_df"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 25,
diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py
index 4fd2f14..cec954d 100644
--- a/src/mozilla_sec_eia/models/sec10k/__init__.py
+++ b/src/mozilla_sec_eia/models/sec10k/__init__.py
@@ -36,7 +36,6 @@
 ex21_data_assets = load_assets_from_modules([ex_21.data])
 shared_assets = load_assets_from_modules([extract])
 
-
 basic_10k_production_job = model_jobs.create_production_model_job(
     "basic_10k_extraction",
     basic_10k.production_assets,

From 26b1a72c66f1642f58564583fae9ba7a3ed81673 Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Thu, 19 Dec 2024 11:22:23 -0800
Subject: [PATCH 158/161] add markdown cell note

---
 notebooks/18-kl-splink-sec-eia.ipynb | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/notebooks/18-kl-splink-sec-eia.ipynb b/notebooks/18-kl-splink-sec-eia.ipynb
index a105e3b..81f3513 100644
--- a/notebooks/18-kl-splink-sec-eia.ipynb
+++ b/notebooks/18-kl-splink-sec-eia.ipynb
@@ -2226,6 +2226,14 @@
     "preds_validation_df[preds_validation_df.match_probability > .9].head(3)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "07fbec17-cef2-4b9c-a005-1623c65c5e20",
+   "metadata": {},
+   "source": [
+    "Figure out what to do about this validation CSV; maybe it should be part of package data? It's not a very big sample size and it's imperfect, so the metrics gained from it should be taken with a grain of salt."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 47,

From 70427a01f08d1eb5704c4da8ec3cefab0b5d4277 Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Thu, 19 Dec 2024 11:23:14 -0800
Subject: [PATCH 159/161] make asset not multi asset

---
 .../models/sec_eia_record_linkage/transform_eia_input.py      | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py
index c832cf0..c8f311c 100644
--- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py
+++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py
@@ -2,7 +2,7 @@
 
 import numpy as np
 import pandas as pd
-from dagster import AssetOut, multi_asset
+from dagster import AssetOut, asset
 
 from mozilla_sec_eia.library.record_linkage_utils import (
     expand_street_name_abbreviations,
@@ -74,7 +74,7 @@ def harvest_eia861_utilities():
     return eia861_df
 
 
-@multi_asset(
+@asset(
     outs={
         "core_eia__parents_and_subsidiaries": AssetOut(
             io_manager_key="pandas_parquet_io_manager"

From 136709dc99fda13c0c46067eea97bf8f75ac24d9 Mon Sep 17 00:00:00 2001
From: zschira <zach.schira@catalyst.coop>
Date: Tue, 7 Jan 2025 14:57:58 -0500
Subject: [PATCH 160/161] Fix asset keywords

---
 .../sec_eia_record_linkage/transform_eia_input.py      | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py
index c8f311c..b12ac71 100644
--- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py
+++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py
@@ -2,7 +2,7 @@
 
 import numpy as np
 import pandas as pd
-from dagster import AssetOut, asset
+from dagster import asset
 
 from mozilla_sec_eia.library.record_linkage_utils import (
     expand_street_name_abbreviations,
@@ -75,12 +75,8 @@ def harvest_eia861_utilities():
 
 
 @asset(
-    outs={
-        "core_eia__parents_and_subsidiaries": AssetOut(
-            io_manager_key="pandas_parquet_io_manager"
-        )
-        # TODO: allow year partitions?
-    }
+    name="core_eia__parents_and_subsidiaries",
+    io_manager_key="pandas_parquet_io_manager",
 )
 # TODO: add Dagster asset inputs for PUDL inputs instead of reading from AWS?
 def eia_rl_input_table():

From 4da70e06e7b83f432e3324a13b87021a1c3661b3 Mon Sep 17 00:00:00 2001
From: Katie Lamb <katherine.lamb@catalyst.coop>
Date: Wed, 15 Jan 2025 20:53:49 -0800
Subject: [PATCH 161/161] splink notebook update

---
 notebooks/18-kl-splink-sec-eia.ipynb | 1050 +++++++++++++-------------
 1 file changed, 506 insertions(+), 544 deletions(-)

diff --git a/notebooks/18-kl-splink-sec-eia.ipynb b/notebooks/18-kl-splink-sec-eia.ipynb
index 81f3513..88d3351 100644
--- a/notebooks/18-kl-splink-sec-eia.ipynb
+++ b/notebooks/18-kl-splink-sec-eia.ipynb
@@ -199,7 +199,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 11,
    "id": "856c14d8-3250-4650-a2db-3808b4718f19",
    "metadata": {},
    "outputs": [
@@ -209,7 +209,7 @@
        "False"
       ]
      },
-     "execution_count": 12,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -229,7 +229,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 12,
    "id": "842fa02e-5202-445c-b728-72bce42e740d",
    "metadata": {},
    "outputs": [
@@ -240,7 +240,7 @@
        "Name: count, dtype: int64"
       ]
      },
-     "execution_count": 19,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -251,7 +251,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 13,
    "id": "b53e6244-f0ca-4256-bc09-9c3264675389",
    "metadata": {},
    "outputs": [
@@ -262,7 +262,7 @@
        "Name: count, dtype: int64"
       ]
      },
-     "execution_count": 20,
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -273,7 +273,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 14,
    "id": "e4d54448-0c2f-452b-931c-ff79a5cc3669",
    "metadata": {},
    "outputs": [],
@@ -300,7 +300,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 15,
    "id": "de894ad0-0b72-4486-b737-c3bfb95f8f05",
    "metadata": {},
    "outputs": [],
@@ -310,7 +310,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 16,
    "id": "4bab1568-6a55-427c-9a78-e44db8b0584d",
    "metadata": {},
    "outputs": [
@@ -319,23 +319,23 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-238a2fea13f7415aa7121762b9fa3832.vega-embed {\n",
+       "  #altair-viz-b405bc71331348a88bc8df1ee06203f9.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-238a2fea13f7415aa7121762b9fa3832.vega-embed details,\n",
-       "  #altair-viz-238a2fea13f7415aa7121762b9fa3832.vega-embed details summary {\n",
+       "  #altair-viz-b405bc71331348a88bc8df1ee06203f9.vega-embed details,\n",
+       "  #altair-viz-b405bc71331348a88bc8df1ee06203f9.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-238a2fea13f7415aa7121762b9fa3832\"></div>\n",
+       "<div id=\"altair-viz-b405bc71331348a88bc8df1ee06203f9\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-238a2fea13f7415aa7121762b9fa3832\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-238a2fea13f7415aa7121762b9fa3832\");\n",
+       "    if (outputDiv.id !== \"altair-viz-b405bc71331348a88bc8df1ee06203f9\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-b405bc71331348a88bc8df1ee06203f9\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -388,7 +388,7 @@
        "alt.LayerChart(...)"
       ]
      },
-     "execution_count": 22,
+     "execution_count": 16,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -399,7 +399,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 17,
    "id": "6b9479e3-e836-4407-a2b6-926c185065a8",
    "metadata": {},
    "outputs": [
@@ -408,23 +408,23 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-299c143177c24d0caed0a11feac611ed.vega-embed {\n",
+       "  #altair-viz-7383a1ed8b3e45db9b6a99a6a3cc2fb1.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-299c143177c24d0caed0a11feac611ed.vega-embed details,\n",
-       "  #altair-viz-299c143177c24d0caed0a11feac611ed.vega-embed details summary {\n",
+       "  #altair-viz-7383a1ed8b3e45db9b6a99a6a3cc2fb1.vega-embed details,\n",
+       "  #altair-viz-7383a1ed8b3e45db9b6a99a6a3cc2fb1.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-299c143177c24d0caed0a11feac611ed\"></div>\n",
+       "<div id=\"altair-viz-7383a1ed8b3e45db9b6a99a6a3cc2fb1\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-299c143177c24d0caed0a11feac611ed\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-299c143177c24d0caed0a11feac611ed\");\n",
+       "    if (outputDiv.id !== \"altair-viz-7383a1ed8b3e45db9b6a99a6a3cc2fb1\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-7383a1ed8b3e45db9b6a99a6a3cc2fb1\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -477,7 +477,7 @@
        "alt.LayerChart(...)"
       ]
      },
-     "execution_count": 23,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -488,7 +488,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 18,
    "id": "95bd4db7-c447-4a0e-ab37-59d4beac0b11",
    "metadata": {},
    "outputs": [
@@ -497,23 +497,23 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-e461e2802f9548f4a5e8af9a3213a168.vega-embed {\n",
+       "  #altair-viz-c963f89156a04d07b8d7d08e55d13a39.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-e461e2802f9548f4a5e8af9a3213a168.vega-embed details,\n",
-       "  #altair-viz-e461e2802f9548f4a5e8af9a3213a168.vega-embed details summary {\n",
+       "  #altair-viz-c963f89156a04d07b8d7d08e55d13a39.vega-embed details,\n",
+       "  #altair-viz-c963f89156a04d07b8d7d08e55d13a39.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-e461e2802f9548f4a5e8af9a3213a168\"></div>\n",
+       "<div id=\"altair-viz-c963f89156a04d07b8d7d08e55d13a39\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-e461e2802f9548f4a5e8af9a3213a168\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-e461e2802f9548f4a5e8af9a3213a168\");\n",
+       "    if (outputDiv.id !== \"altair-viz-c963f89156a04d07b8d7d08e55d13a39\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-c963f89156a04d07b8d7d08e55d13a39\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -559,14 +559,14 @@
        "        .catch(showError)\n",
        "        .then(() => displayChart(vegaEmbed));\n",
        "    }\n",
-       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"vconcat\": [{\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9997377991676331, \"percentile_inc_nulls\": 0.9997377991676331, \"value_count\": 8, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 0.9989348649978638, \"percentile_inc_nulls\": 0.9989348649978638, \"value_count\": 7, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 49.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 0.9960836172103882, \"percentile_inc_nulls\": 0.9960836172103882, \"value_count\": 6, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 174.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 0.9845311641693115, \"percentile_inc_nulls\": 0.9845311641693115, \"value_count\": 5, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 705.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 0.9480876922607422, \"percentile_inc_nulls\": 0.9480876922607422, \"value_count\": 4, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 2224.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 0.8398387432098389, \"percentile_inc_nulls\": 0.8398387432098389, \"value_count\": 3, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 6606.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 
0.5750991106033325, \"percentile_inc_nulls\": 0.5750991106033325, \"value_count\": 2, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 16156.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 1, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 35096.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 8, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 46111}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"company_name\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 46111 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 8, \"group_name\": \"_company_name_\", \"value\": \"la jolla pharmaceutical company\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 8, \"group_name\": \"_company_name_\", \"value\": \"comprehensive care corporation\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 
46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"comerica inc /new/\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"omega healthcare investors incorporated\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"camelot corporation\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"empire petroleum corporation\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"stillwater mining co /de/\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"dycom industries incorporated\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"softech incorporated\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 6, \"group_name\": \"_company_name_\", \"value\": \"microvision incorporated\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": 
\"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"moringa acquisition corporation\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"bank 2021 bnk38\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"bank 2021 bnk36\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"morgan stanley capital i trust 2016 ubs11\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"morgan stanley capital i trust 2016 ubs12\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 8]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.0132485032081604, \"percentile_inc_nulls\": 0.013862967491149902, \"value_count\": 47, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, 
\"sum_tokens_in_value_count_group\": 47.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.012494266033172607, \"percentile_inc_nulls\": 0.013109147548675537, \"value_count\": 46, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 46.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.011789202690124512, \"percentile_inc_nulls\": 0.012404561042785645, \"value_count\": 43, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 43.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.01121532917022705, \"percentile_inc_nulls\": 0.011831045150756836, \"value_count\": 35, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 35.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.010674238204956055, \"percentile_inc_nulls\": 0.011290252208709717, \"value_count\": 33, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.010149538516998291, \"percentile_inc_nulls\": 0.010765910148620605, \"value_count\": 32, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 32.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.009674012660980225, \"percentile_inc_nulls\": 0.010290682315826416, \"value_count\": 29, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 29.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.009264111518859863, \"percentile_inc_nulls\": 0.009881019592285156, \"value_count\": 25, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 
61026, \"sum_tokens_in_value_count_group\": 25.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.008870601654052734, \"percentile_inc_nulls\": 0.009487748146057129, \"value_count\": 24, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 24.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.008181929588317871, \"percentile_inc_nulls\": 0.008799552917480469, \"value_count\": 21, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.007870376110076904, \"percentile_inc_nulls\": 0.008488178253173828, \"value_count\": 19, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 19.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.007575273513793945, \"percentile_inc_nulls\": 0.008193254470825195, \"value_count\": 18, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 18.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.007050573825836182, \"percentile_inc_nulls\": 0.007668852806091309, \"value_count\": 16, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 32.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.00680464506149292, \"percentile_inc_nulls\": 0.007423043251037598, \"value_count\": 15, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 15.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.006115972995758057, \"percentile_inc_nulls\": 0.0067348480224609375, \"value_count\": 14, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, 
\"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.00590282678604126, \"percentile_inc_nulls\": 0.006521821022033691, \"value_count\": 13, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 13.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.00570601224899292, \"percentile_inc_nulls\": 0.006325185298919678, \"value_count\": 12, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 12.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.005164921283721924, \"percentile_inc_nulls\": 0.005784392356872559, \"value_count\": 11, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0041811466217041016, \"percentile_inc_nulls\": 0.00480121374130249, \"value_count\": 10, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 60.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.003885984420776367, \"percentile_inc_nulls\": 0.004506289958953857, \"value_count\": 9, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 18.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.003623664379119873, \"percentile_inc_nulls\": 0.004244089126586914, \"value_count\": 8, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0035088658332824707, \"percentile_inc_nulls\": 0.0041294097900390625, \"value_count\": 7, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, 
\"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 7.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.002721846103668213, \"percentile_inc_nulls\": 0.003342866897583008, \"value_count\": 6, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 48.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0019840002059936523, \"percentile_inc_nulls\": 0.002605438232421875, \"value_count\": 5, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 45.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0012625455856323242, \"percentile_inc_nulls\": 0.00188446044921875, \"value_count\": 4, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 44.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0007214546203613281, \"percentile_inc_nulls\": 0.0013436675071716309, \"value_count\": 3, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0003935098648071289, \"percentile_inc_nulls\": 0.0010159611701965332, \"value_count\": 2, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 20.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0006226897239685059, \"value_count\": 1, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 24.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.8305076360702515, \"percentile_inc_nulls\": 0.8306131958961487, \"value_count\": 10337, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, 
\"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 10337.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.691791832447052, \"percentile_inc_nulls\": 0.6919837594032288, \"value_count\": 8460, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 8460.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.6141864061355591, \"percentile_inc_nulls\": 0.6144266128540039, \"value_count\": 4733, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 4733.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.563307523727417, \"percentile_inc_nulls\": 0.5635794401168823, \"value_count\": 3103, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 3103.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.5253000259399414, \"percentile_inc_nulls\": 0.5255956649780273, \"value_count\": 2318, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 2318.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.49045711755752563, \"percentile_inc_nulls\": 0.4907744526863098, \"value_count\": 2125, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 2125.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.4584508538246155, \"percentile_inc_nulls\": 0.45878803730010986, \"value_count\": 1952, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1952.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.42987143993377686, \"percentile_inc_nulls\": 0.4302264451980591, \"value_count\": 1743, \"group_name\": \"_state_\", \"total_non_null_rows\": 
60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1743.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.4037187695503235, \"percentile_inc_nulls\": 0.4040900468826294, \"value_count\": 1595, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1595.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.3788614273071289, \"percentile_inc_nulls\": 0.3792482018470764, \"value_count\": 1516, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1516.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.35418444871902466, \"percentile_inc_nulls\": 0.35458654165267944, \"value_count\": 1505, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1505.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.33116352558135986, \"percentile_inc_nulls\": 0.3315799832344055, \"value_count\": 1404, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1404.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.3082737326622009, \"percentile_inc_nulls\": 0.3087044954299927, \"value_count\": 1396, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1396.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.2873516082763672, \"percentile_inc_nulls\": 0.28779536485671997, \"value_count\": 1276, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1276.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.26756083965301514, \"percentile_inc_nulls\": 0.2680169343948364, \"value_count\": 1207, \"group_name\": \"_state_\", 
\"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1207.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.24914735555648804, \"percentile_inc_nulls\": 0.2496148943901062, \"value_count\": 1123, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1123.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.23243916034698486, \"percentile_inc_nulls\": 0.23291712999343872, \"value_count\": 1019, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1019.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.21587854623794556, \"percentile_inc_nulls\": 0.21636676788330078, \"value_count\": 1010, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1010.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.20199054479599, \"percentile_inc_nulls\": 0.20248746871948242, \"value_count\": 847, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 847.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.18875843286514282, \"percentile_inc_nulls\": 0.18926358222961426, \"value_count\": 807, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 807.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.17613303661346436, \"percentile_inc_nulls\": 0.17664599418640137, \"value_count\": 770, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 770.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.1640486717224121, \"percentile_inc_nulls\": 0.1645691990852356, \"value_count\": 737, \"group_name\": 
\"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 737.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.15265297889709473, \"percentile_inc_nulls\": 0.15318059921264648, \"value_count\": 695, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 695.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.14209353923797607, \"percentile_inc_nulls\": 0.14262771606445312, \"value_count\": 644, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 644.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.13258343935012817, \"percentile_inc_nulls\": 0.13312357664108276, \"value_count\": 580, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 580.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.12354886531829834, \"percentile_inc_nulls\": 0.12409466505050659, \"value_count\": 551, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 551.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.11457991600036621, \"percentile_inc_nulls\": 0.11513125896453857, \"value_count\": 547, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 547.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.10720139741897583, \"percentile_inc_nulls\": 0.1077573299407959, \"value_count\": 450, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 450.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.10013443231582642, \"percentile_inc_nulls\": 0.10069477558135986, \"value_count\": 431, 
\"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 431.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0935102105140686, \"percentile_inc_nulls\": 0.09407466650009155, \"value_count\": 404, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 404.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.08788615465164185, \"percentile_inc_nulls\": 0.08845412731170654, \"value_count\": 343, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 343.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.08291792869567871, \"percentile_inc_nulls\": 0.08348900079727173, \"value_count\": 303, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 303.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.07842528820037842, \"percentile_inc_nulls\": 0.0789991021156311, \"value_count\": 274, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 274.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.07416212558746338, \"percentile_inc_nulls\": 0.07473862171173096, \"value_count\": 260, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 260.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.07007938623428345, \"percentile_inc_nulls\": 0.07065838575363159, \"value_count\": 249, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 249.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.06240570545196533, \"percentile_inc_nulls\": 0.06298953294754028, \"value_count\": 234, 
\"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 468.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0587000846862793, \"percentile_inc_nulls\": 0.05928617715835571, \"value_count\": 226, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 226.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.05512559413909912, \"percentile_inc_nulls\": 0.055713951587677, \"value_count\": 218, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 218.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.05199384689331055, \"percentile_inc_nulls\": 0.052584171295166016, \"value_count\": 191, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 191.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.048944056034088135, \"percentile_inc_nulls\": 0.049536287784576416, \"value_count\": 186, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 186.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.04597628116607666, \"percentile_inc_nulls\": 0.0465703010559082, \"value_count\": 181, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 181.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.04348397254943848, \"percentile_inc_nulls\": 0.044079601764678955, \"value_count\": 152, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 152.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.041270434856414795, \"percentile_inc_nulls\": 0.04186737537384033, \"value_count\": 
135, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 135.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.03918802738189697, \"percentile_inc_nulls\": 0.039786338806152344, \"value_count\": 127, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 127.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.037335216999053955, \"percentile_inc_nulls\": 0.03793466091156006, \"value_count\": 113, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 113.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.03549879789352417, \"percentile_inc_nulls\": 0.036099374294281006, \"value_count\": 112, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.03369516134262085, \"percentile_inc_nulls\": 0.03429687023162842, \"value_count\": 110, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.03195708990097046, \"percentile_inc_nulls\": 0.032559871673583984, \"value_count\": 106, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 106.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.03026825189590454, \"percentile_inc_nulls\": 0.030872106552124023, \"value_count\": 103, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 103.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.028595805168151855, \"percentile_inc_nulls\": 0.02920067310333252, 
\"value_count\": 102, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.026972532272338867, \"percentile_inc_nulls\": 0.027578413486480713, \"value_count\": 99, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.02552962303161621, \"percentile_inc_nulls\": 0.026136398315429688, \"value_count\": 88, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.022938966751098633, \"percentile_inc_nulls\": 0.023547351360321045, \"value_count\": 79, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 158.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.021774768829345703, \"percentile_inc_nulls\": 0.022383928298950195, \"value_count\": 71, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 71.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.020659804344177246, \"percentile_inc_nulls\": 0.021269619464874268, \"value_count\": 68, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 68.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.01956123113632202, \"percentile_inc_nulls\": 0.020171701908111572, \"value_count\": 67, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 67.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.018577396869659424, \"percentile_inc_nulls\": 
0.019188523292541504, \"value_count\": 60, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 60.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.017626404762268066, \"percentile_inc_nulls\": 0.0182381272315979, \"value_count\": 58, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.015757203102111816, \"percentile_inc_nulls\": 0.016370058059692383, \"value_count\": 57, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.014838993549346924, \"percentile_inc_nulls\": 0.015452444553375244, \"value_count\": 56, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.014019131660461426, \"percentile_inc_nulls\": 0.014633119106292725, \"value_count\": 50, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 50.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 47, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 47.0, \"distinct_value_count\": 172}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": 
\"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"state\\\"\", \"subtitle\": \"In this col, 38 values (0.1%) are null and there are 172 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 10337, \"group_name\": \"_state_\", \"value\": \"ca\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 8460, \"group_name\": \"_state_\", \"value\": \"ny\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 4733, \"group_name\": \"_state_\", \"value\": \"tx\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 3103, \"group_name\": \"_state_\", \"value\": \"fl\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 2318, \"group_name\": \"_state_\", \"value\": \"ma\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 2125, \"group_name\": \"_state_\", \"value\": \"il\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1952, \"group_name\": \"_state_\", \"value\": \"nj\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1743, \"group_name\": \"_state_\", \"value\": \"pa\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1595, \"group_name\": \"_state_\", \"value\": \"md\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1516, \"group_name\": 
\"_state_\", \"value\": \"nv\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"j1\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"w5\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"s9\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"a7\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"r4\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 10337]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, 
\"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.34724318981170654, \"percentile_inc_nulls\": 0.3473929166793823, \"value_count\": 51, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3439651131629944, \"percentile_inc_nulls\": 0.34411561489105225, \"value_count\": 50, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.33994948863983154, \"percentile_inc_nulls\": 0.3401009440422058, \"value_count\": 49, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 245.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3344423770904541, \"percentile_inc_nulls\": 0.33459508419036865, \"value_count\": 48, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 336.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.33213138580322266, \"percentile_inc_nulls\": 0.3322846293449402, \"value_count\": 47, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 141.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3291155695915222, \"percentile_inc_nulls\": 0.3292694687843323, \"value_count\": 46, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 184.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3261653184890747, \"percentile_inc_nulls\": 0.32631993293762207, \"value_count\": 45, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 180.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3211171627044678, \"percentile_inc_nulls\": 0.32127290964126587, \"value_count\": 44, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 308.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3168884515762329, \"percentile_inc_nulls\": 0.3170452117919922, \"value_count\": 43, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 258.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.31482332944869995, \"percentile_inc_nulls\": 0.31498050689697266, \"value_count\": 42, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 126.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3101193308830261, \"percentile_inc_nulls\": 0.3102775812149048, \"value_count\": 41, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 287.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.30749690532684326, \"percentile_inc_nulls\": 0.3076557517051697, \"value_count\": 40, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3030223846435547, \"percentile_inc_nulls\": 0.3031822443008423, \"value_count\": 39, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 273.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.30115389823913574, \"percentile_inc_nulls\": 0.30131417512893677, \"value_count\": 38, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2987281084060669, \"percentile_inc_nulls\": 0.2988889813423157, \"value_count\": 37, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 148.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2957778573036194, \"percentile_inc_nulls\": 0.29593944549560547, \"value_count\": 36, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 180.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.29348325729370117, \"percentile_inc_nulls\": 0.29364532232284546, \"value_count\": 35, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 140.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2884678244590759, \"percentile_inc_nulls\": 0.2886310815811157, \"value_count\": 34, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 306.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.285222589969635, \"percentile_inc_nulls\": 0.2853865623474121, \"value_count\": 33, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 198.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2831246256828308, \"percentile_inc_nulls\": 0.2832890748977661, \"value_count\": 32, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 128.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2770274877548218, \"percentile_inc_nulls\": 0.2771933078765869, \"value_count\": 31, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 372.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2735854983329773, \"percentile_inc_nulls\": 0.2737521529197693, \"value_count\": 30, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 210.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2726348638534546, \"percentile_inc_nulls\": 0.2728017568588257, \"value_count\": 29, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.26942241191864014, \"percentile_inc_nulls\": 0.26959002017974854, \"value_count\": 28, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 196.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.26632463932037354, \"percentile_inc_nulls\": 0.26649296283721924, \"value_count\": 27, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 189.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.26078474521636963, \"percentile_inc_nulls\": 0.2609543204307556, \"value_count\": 26, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 338.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.25586771965026855, \"percentile_inc_nulls\": 0.2560384273529053, \"value_count\": 25, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 300.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.24996721744537354, \"percentile_inc_nulls\": 0.2501392960548401, \"value_count\": 24, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 360.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.24544352293014526, \"percentile_inc_nulls\": 0.245616614818573, \"value_count\": 23, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 276.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.23895299434661865, \"percentile_inc_nulls\": 0.23912757635116577, \"value_count\": 22, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 396.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.23551106452941895, \"percentile_inc_nulls\": 0.23568642139434814, \"value_count\": 21, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 210.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2273159623146057, \"percentile_inc_nulls\": 0.22749322652816772, \"value_count\": 20, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 500.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2207762598991394, \"percentile_inc_nulls\": 0.2209550142288208, \"value_count\": 19, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 399.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2142857313156128, \"percentile_inc_nulls\": 0.21446597576141357, \"value_count\": 18, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 396.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2109420895576477, \"percentile_inc_nulls\": 0.21112310886383057, \"value_count\": 17, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 204.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.20648396015167236, \"percentile_inc_nulls\": 0.2066659927368164, \"value_count\": 16, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 272.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.1981249451637268, \"percentile_inc_nulls\": 0.19830894470214844, \"value_count\": 15, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 510.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.19101160764694214, \"percentile_inc_nulls\": 0.1911972165107727, \"value_count\": 14, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 434.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.18227559328079224, \"percentile_inc_nulls\": 0.18246322870254517, \"value_count\": 13, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 533.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.17460501194000244, \"percentile_inc_nulls\": 0.17479437589645386, \"value_count\": 12, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 468.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.16559040546417236, \"percentile_inc_nulls\": 0.16578179597854614, \"value_count\": 11, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 550.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.15854257345199585, \"percentile_inc_nulls\": 0.15873563289642334, \"value_count\": 10, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 430.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.14836424589157104, \"percentile_inc_nulls\": 0.14855962991714478, \"value_count\": 9, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 621.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.13669443130493164, \"percentile_inc_nulls\": 0.13689249753952026, \"value_count\": 8, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 712.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.12717169523239136, \"percentile_inc_nulls\": 0.12737196683883667, \"value_count\": 7, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 581.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.11497735977172852, \"percentile_inc_nulls\": 0.11518043279647827, \"value_count\": 6, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 744.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.09989839792251587, \"percentile_inc_nulls\": 0.10010486841201782, \"value_count\": 5, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 920.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.0844915509223938, \"percentile_inc_nulls\": 0.08470159769058228, \"value_count\": 4, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 940.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.06585592031478882, \"percentile_inc_nulls\": 0.06607019901275635, \"value_count\": 3, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1137.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.040352702140808105, \"percentile_inc_nulls\": 0.0405728816986084, \"value_count\": 2, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1556.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.00022941827774047852, \"value_count\": 1, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 2462.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.8892185091972351, \"percentile_inc_nulls\": 0.8892439603805542, \"value_count\": 6759, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 6759.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.8651740550994873, \"percentile_inc_nulls\": 0.8652049899101257, \"value_count\": 1467, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1467.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.8483904600143433, \"percentile_inc_nulls\": 0.8484252691268921, \"value_count\": 1024, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1024.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.8343932628631592, \"percentile_inc_nulls\": 0.8344312310218811, \"value_count\": 854, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 854.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.8206254243850708, \"percentile_inc_nulls\": 0.8206666111946106, \"value_count\": 840, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 840.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.8072838187217712, \"percentile_inc_nulls\": 0.8073280453681946, \"value_count\": 814, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 814.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7944011092185974, \"percentile_inc_nulls\": 0.7944482564926147, \"value_count\": 786, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 786.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7821412086486816, \"percentile_inc_nulls\": 0.7821912169456482, \"value_count\": 748, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 748.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7701107859611511, \"percentile_inc_nulls\": 0.7701635360717773, \"value_count\": 734, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 734.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7582606673240662, \"percentile_inc_nulls\": 0.758316159248352, \"value_count\": 723, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 723.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7466564178466797, \"percentile_inc_nulls\": 0.7467144727706909, \"value_count\": 708, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 708.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7364616394042969, \"percentile_inc_nulls\": 0.7365221381187439, \"value_count\": 622, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 622.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7267422676086426, \"percentile_inc_nulls\": 0.7268049716949463, \"value_count\": 593, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 593.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7170720100402832, \"percentile_inc_nulls\": 0.7171369791030884, \"value_count\": 590, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 590.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7074673771858215, \"percentile_inc_nulls\": 0.7075344920158386, \"value_count\": 586, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 586.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6978955268859863, \"percentile_inc_nulls\": 0.6979647874832153, \"value_count\": 584, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 584.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.688372790813446, \"percentile_inc_nulls\": 0.6884442567825317, \"value_count\": 581, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 581.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6793090105056763, \"percentile_inc_nulls\": 0.6793825626373291, \"value_count\": 553, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 553.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6723759174346924, \"percentile_inc_nulls\": 0.6724510788917542, \"value_count\": 423, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 423.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6655576229095459, \"percentile_inc_nulls\": 0.6656343340873718, \"value_count\": 416, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 416.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6590179204940796, \"percentile_inc_nulls\": 0.6590961217880249, \"value_count\": 399, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 399.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6526912450790405, \"percentile_inc_nulls\": 0.6527709364891052, \"value_count\": 386, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 386.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6471513509750366, \"percentile_inc_nulls\": 0.6472322940826416, \"value_count\": 338, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 338.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6416934728622437, \"percentile_inc_nulls\": 0.6417756080627441, \"value_count\": 333, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 333.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6312692165374756, \"percentile_inc_nulls\": 0.6313538551330566, \"value_count\": 318, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 636.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.626384973526001, \"percentile_inc_nulls\": 0.626470685005188, \"value_count\": 298, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 298.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6217301487922668, \"percentile_inc_nulls\": 0.6218169331550598, \"value_count\": 284, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 284.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.612518846988678, \"percentile_inc_nulls\": 0.6126077175140381, \"value_count\": 281, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 562.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6079951524734497, \"percentile_inc_nulls\": 0.608085036277771, \"value_count\": 276, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 276.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.603569746017456, \"percentile_inc_nulls\": 0.6036607027053833, \"value_count\": 270, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 270.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5992263555526733, \"percentile_inc_nulls\": 0.5993183255195618, \"value_count\": 265, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 265.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5948993563652039, \"percentile_inc_nulls\": 0.5949922800064087, \"value_count\": 264, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 264.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5908181667327881, \"percentile_inc_nulls\": 0.5909121036529541, \"value_count\": 249, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 249.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5869500637054443, \"percentile_inc_nulls\": 0.5870448350906372, \"value_count\": 236, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 236.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5831148028373718, \"percentile_inc_nulls\": 0.5832104682922363, \"value_count\": 234, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.575706422328949, \"percentile_inc_nulls\": 0.5758037567138672, \"value_count\": 226, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 452.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5720186233520508, \"percentile_inc_nulls\": 0.5721167922019958, \"value_count\": 225, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 225.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5685439109802246, \"percentile_inc_nulls\": 0.5686428546905518, \"value_count\": 212, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 212.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5616599917411804, \"percentile_inc_nulls\": 0.5617605447769165, \"value_count\": 210, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 420.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.558234453201294, \"percentile_inc_nulls\": 0.5583357810974121, \"value_count\": 209, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 209.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5548253059387207, \"percentile_inc_nulls\": 0.554927408695221, \"value_count\": 208, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 208.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5514816641807556, \"percentile_inc_nulls\": 0.5515846014022827, \"value_count\": 204, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 204.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5482856035232544, \"percentile_inc_nulls\": 0.548389196395874, \"value_count\": 195, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 195.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5451222658157349, \"percentile_inc_nulls\": 0.5452266335487366, \"value_count\": 193, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 193.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5419917106628418, \"percentile_inc_nulls\": 0.5420968532562256, \"value_count\": 191, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 191.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5389431715011597, \"percentile_inc_nulls\": 0.5390489101409912, \"value_count\": 186, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 186.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5359601378440857, \"percentile_inc_nulls\": 0.5360665917396545, \"value_count\": 182, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 182.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.532993495464325, \"percentile_inc_nulls\": 0.5331006050109863, \"value_count\": 181, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 181.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5302727222442627, \"percentile_inc_nulls\": 0.5303804874420166, \"value_count\": 166, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 166.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5275847315788269, \"percentile_inc_nulls\": 0.5276931524276733, \"value_count\": 164, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 164.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5249786972999573, \"percentile_inc_nulls\": 0.5250876545906067, \"value_count\": 159, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 159.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5224218368530273, \"percentile_inc_nulls\": 0.5225313901901245, \"value_count\": 156, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 156.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5198976993560791, \"percentile_inc_nulls\": 0.520007848739624, \"value_count\": 154, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 154.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5173900127410889, \"percentile_inc_nulls\": 0.5175007581710815, \"value_count\": 153, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 153.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5148987174034119, \"percentile_inc_nulls\": 0.5150099992752075, \"value_count\": 152, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 152.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5124237537384033, \"percentile_inc_nulls\": 0.5125356316566467, \"value_count\": 151, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 151.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5099652409553528, \"percentile_inc_nulls\": 0.510077714920044, \"value_count\": 150, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 150.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5075395107269287, \"percentile_inc_nulls\": 0.5076524615287781, \"value_count\": 148, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 148.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5051465034484863, \"percentile_inc_nulls\": 0.5052600502967834, \"value_count\": 146, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 146.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4980167746543884, \"percentile_inc_nulls\": 0.4981319308280945, \"value_count\": 145, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 435.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.48621582984924316, \"percentile_inc_nulls\": 0.4863336682319641, \"value_count\": 144, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 720.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4838883876800537, \"percentile_inc_nulls\": 0.4840068221092224, \"value_count\": 142, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 142.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.48161017894744873, \"percentile_inc_nulls\": 0.48172909021377563, \"value_count\": 139, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 139.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.47936469316482544, \"percentile_inc_nulls\": 0.4794841408729553, \"value_count\": 137, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 137.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.47713565826416016, \"percentile_inc_nulls\": 0.47725558280944824, \"value_count\": 136, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 136.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4749557375907898, \"percentile_inc_nulls\": 0.47507619857788086, \"value_count\": 133, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 133.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.47279220819473267, \"percentile_inc_nulls\": 0.4729132056236267, \"value_count\": 132, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 132.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4706615209579468, \"percentile_inc_nulls\": 0.47078293561935425, \"value_count\": 130, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 130.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4685635566711426, \"percentile_inc_nulls\": 0.468685507774353, \"value_count\": 128, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 128.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4664819836616516, \"percentile_inc_nulls\": 0.46660441160202026, \"value_count\": 127, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 127.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4644496440887451, \"percentile_inc_nulls\": 0.4645724892616272, \"value_count\": 124, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 124.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4624336361885071, \"percentile_inc_nulls\": 0.46255695819854736, \"value_count\": 123, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 123.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4604504108428955, \"percentile_inc_nulls\": 0.460574209690094, \"value_count\": 121, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 121.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.45848357677459717, \"percentile_inc_nulls\": 0.4586077928543091, \"value_count\": 120, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.45458269119262695, \"percentile_inc_nulls\": 0.4547078013420105, \"value_count\": 119, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 238.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4526650309562683, \"percentile_inc_nulls\": 0.45279061794281006, \"value_count\": 117, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 117.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.45078015327453613, \"percentile_inc_nulls\": 0.4509061574935913, \"value_count\": 115, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 115.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.445223867893219, \"percentile_inc_nulls\": 0.44535118341445923, \"value_count\": 113, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 339.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4416508078575134, \"percentile_inc_nulls\": 0.44177889823913574, \"value_count\": 109, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 218.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4398806691169739, \"percentile_inc_nulls\": 0.4400091767311096, \"value_count\": 108, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.43812692165374756, \"percentile_inc_nulls\": 0.43825584650039673, \"value_count\": 107, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 107.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.43468499183654785, \"percentile_inc_nulls\": 0.4348146915435791, \"value_count\": 105, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 210.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4314069151878357, \"percentile_inc_nulls\": 0.431537389755249, \"value_count\": 100, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.42978429794311523, \"percentile_inc_nulls\": 0.4299151301383972, \"value_count\": 99, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.428178071975708, \"percentile_inc_nulls\": 0.42830926179885864, \"value_count\": 98, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.42499834299087524, \"percentile_inc_nulls\": 0.4251302480697632, \"value_count\": 97, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 194.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4234248995780945, \"percentile_inc_nulls\": 0.4235571622848511, \"value_count\": 96, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 96.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4218842387199402, \"percentile_inc_nulls\": 0.42201685905456543, \"value_count\": 94, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 94.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4203763008117676, \"percentile_inc_nulls\": 0.4205092787742615, \"value_count\": 92, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 92.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.418884813785553, \"percentile_inc_nulls\": 0.4190181493759155, \"value_count\": 91, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 91.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4145086407661438, \"percentile_inc_nulls\": 0.41464293003082275, \"value_count\": 89, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 267.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4131154417991638, \"percentile_inc_nulls\": 0.4132500886917114, \"value_count\": 85, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 85.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4103618860244751, \"percentile_inc_nulls\": 0.41049718856811523, \"value_count\": 84, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 168.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.40764111280441284, \"percentile_inc_nulls\": 0.40777701139450073, \"value_count\": 83, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 166.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.40495312213897705, \"percentile_inc_nulls\": 0.4050896167755127, \"value_count\": 82, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 164.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4022979140281677, \"percentile_inc_nulls\": 0.4024350047111511, \"value_count\": 81, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 162.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4009866714477539, \"percentile_inc_nulls\": 0.40112411975860596, \"value_count\": 80, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 80.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3983970284461975, \"percentile_inc_nulls\": 0.3985350728034973, \"value_count\": 79, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 158.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3971513509750366, \"percentile_inc_nulls\": 0.3972896933555603, \"value_count\": 76, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 76.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3934635519981384, \"percentile_inc_nulls\": 0.39360272884368896, \"value_count\": 75, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 225.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.39103782176971436, \"percentile_inc_nulls\": 0.39117753505706787, \"value_count\": 74, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 148.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3863174319267273, \"percentile_inc_nulls\": 0.38645821809768677, \"value_count\": 72, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 288.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.38515371084213257, \"percentile_inc_nulls\": 0.3852947950363159, \"value_count\": 71, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 71.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.38400644063949585, \"percentile_inc_nulls\": 0.3841477632522583, \"value_count\": 70, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.38061362504959106, \"percentile_inc_nulls\": 0.3807557225227356, \"value_count\": 69, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 207.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3783845901489258, \"percentile_inc_nulls\": 0.3785271644592285, \"value_count\": 68, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 136.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3773028254508972, \"percentile_inc_nulls\": 0.37744569778442383, \"value_count\": 66, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 66.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3741067051887512, \"percentile_inc_nulls\": 0.37425029277801514, \"value_count\": 65, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 195.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3730577826499939, \"percentile_inc_nulls\": 0.3732016086578369, \"value_count\": 64, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 64.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.37202519178390503, \"percentile_inc_nulls\": 0.37216925621032715, \"value_count\": 63, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 63.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3710089921951294, \"percentile_inc_nulls\": 0.3711532950401306, \"value_count\": 62, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 62.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3690093755722046, \"percentile_inc_nulls\": 0.369154155254364, \"value_count\": 61, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 122.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3660591244697571, \"percentile_inc_nulls\": 0.36620455980300903, \"value_count\": 60, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 180.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.36130595207214355, \"percentile_inc_nulls\": 0.36145251989364624, \"value_count\": 58, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 290.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3594374656677246, \"percentile_inc_nulls\": 0.3595844507217407, \"value_count\": 57, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.35851961374282837, \"percentile_inc_nulls\": 0.3586667776107788, \"value_count\": 56, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.35671669244766235, \"percentile_inc_nulls\": 0.3568642735481262, \"value_count\": 55, \"group_name\": \"_city_\", 
\"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3549465537071228, \"percentile_inc_nulls\": 0.3550945520401001, \"value_count\": 54, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.35147184133529663, \"percentile_inc_nulls\": 0.351620614528656, \"value_count\": 53, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 212.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3489149808883667, \"percentile_inc_nulls\": 0.34906435012817383, \"value_count\": 52, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 156.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 51, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 5121}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"city\\\"\", \"subtitle\": \"In this col, 14 values (0.0%) are null and there are 5121 distinct 
values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 6759, \"group_name\": \"_city_\", \"value\": \"new york\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 1467, \"group_name\": \"_city_\", \"value\": \"houston\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 1024, \"group_name\": \"_city_\", \"value\": \"dallas\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 854, \"group_name\": \"_city_\", \"value\": \"las vegas\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 840, \"group_name\": \"_city_\", \"value\": \"calabasas\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 814, \"group_name\": \"_city_\", \"value\": \"charlotte\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 786, \"group_name\": \"_city_\", \"value\": \"chicago\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 748, \"group_name\": \"_city_\", \"value\": \"san diego\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 734, \"group_name\": \"_city_\", \"value\": \"wilmington\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 723, \"group_name\": \"_city_\", \"value\": \"boston\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": 
\"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"shoreham\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"allston\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"tainan city\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"airport city\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"great neck,\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 6759]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9869236350059509, \"percentile_inc_nulls\": 0.9869236350059509, \"value_count\": 798, 
\"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 798.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9778782725334167, \"percentile_inc_nulls\": 0.9778782725334167, \"value_count\": 552, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 552.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9721266627311707, \"percentile_inc_nulls\": 0.9721266627311707, \"value_count\": 351, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 351.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9664405584335327, \"percentile_inc_nulls\": 0.9664405584335327, \"value_count\": 347, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 347.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9614754319190979, \"percentile_inc_nulls\": 0.9614754319190979, \"value_count\": 303, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 303.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9569363594055176, \"percentile_inc_nulls\": 0.9569363594055176, \"value_count\": 277, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 277.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9525775909423828, \"percentile_inc_nulls\": 0.9525775909423828, \"value_count\": 266, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 266.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9486120939254761, 
\"percentile_inc_nulls\": 0.9486120939254761, \"value_count\": 242, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 242.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9448267817497253, \"percentile_inc_nulls\": 0.9448267817497253, \"value_count\": 231, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 231.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9411562085151672, \"percentile_inc_nulls\": 0.9411562085151672, \"value_count\": 224, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 224.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.937616765499115, \"percentile_inc_nulls\": 0.937616765499115, \"value_count\": 216, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 216.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9341428279876709, \"percentile_inc_nulls\": 0.9341428279876709, \"value_count\": 212, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 212.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.930881917476654, \"percentile_inc_nulls\": 0.930881917476654, \"value_count\": 199, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 199.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9276373982429504, \"percentile_inc_nulls\": 0.9276373982429504, \"value_count\": 198, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 198.0, 
\"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9246222972869873, \"percentile_inc_nulls\": 0.9246222972869873, \"value_count\": 184, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 184.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9217710494995117, \"percentile_inc_nulls\": 0.9217710494995117, \"value_count\": 174, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 174.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9162651896476746, \"percentile_inc_nulls\": 0.9162651896476746, \"value_count\": 168, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 336.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9136925339698792, \"percentile_inc_nulls\": 0.9136925339698792, \"value_count\": 157, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 157.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9112181663513184, \"percentile_inc_nulls\": 0.9112181663513184, \"value_count\": 151, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 151.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9088093638420105, \"percentile_inc_nulls\": 0.9088093638420105, \"value_count\": 147, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 147.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9065152406692505, \"percentile_inc_nulls\": 0.9065152406692505, \"value_count\": 140, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, 
\"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 140.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9042539000511169, \"percentile_inc_nulls\": 0.9042539000511169, \"value_count\": 138, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9021236896514893, \"percentile_inc_nulls\": 0.9021236896514893, \"value_count\": 130, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 130.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9001245498657227, \"percentile_inc_nulls\": 0.9001245498657227, \"value_count\": 122, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 122.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8982564806938171, \"percentile_inc_nulls\": 0.8982564806938171, \"value_count\": 114, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8964048027992249, \"percentile_inc_nulls\": 0.8964048027992249, \"value_count\": 113, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 113.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8946186900138855, \"percentile_inc_nulls\": 0.8946186900138855, \"value_count\": 109, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 109.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8929636478424072, \"percentile_inc_nulls\": 0.8929636478424072, \"value_count\": 101, 
\"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 101.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8913413882255554, \"percentile_inc_nulls\": 0.8913413882255554, \"value_count\": 99, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8897355198860168, \"percentile_inc_nulls\": 0.8897355198860168, \"value_count\": 98, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8866220712661743, \"percentile_inc_nulls\": 0.8866220712661743, \"value_count\": 95, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 190.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8850981593132019, \"percentile_inc_nulls\": 0.8850981593132019, \"value_count\": 93, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 93.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8836561441421509, \"percentile_inc_nulls\": 0.8836561441421509, \"value_count\": 88, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8822305202484131, \"percentile_inc_nulls\": 0.8822305202484131, \"value_count\": 87, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 87.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8810015320777893, 
\"percentile_inc_nulls\": 0.8810015320777893, \"value_count\": 75, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 75.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8798708915710449, \"percentile_inc_nulls\": 0.8798708915710449, \"value_count\": 69, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 69.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.877707839012146, \"percentile_inc_nulls\": 0.877707839012146, \"value_count\": 66, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 132.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8766427636146545, \"percentile_inc_nulls\": 0.8766427636146545, \"value_count\": 65, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 65.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8756431937217712, \"percentile_inc_nulls\": 0.8756431937217712, \"value_count\": 61, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 61.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8746927380561829, \"percentile_inc_nulls\": 0.8746927380561829, \"value_count\": 58, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8737751245498657, \"percentile_inc_nulls\": 0.8737751245498657, \"value_count\": 56, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 
36703}, {\"percentile_ex_nulls\": 0.8728902339935303, \"percentile_inc_nulls\": 0.8728902339935303, \"value_count\": 54, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 54.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8711532950401306, \"percentile_inc_nulls\": 0.8711532950401306, \"value_count\": 53, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 106.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8703175783157349, \"percentile_inc_nulls\": 0.8703175783157349, \"value_count\": 51, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 51.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8686789274215698, \"percentile_inc_nulls\": 0.8686789274215698, \"value_count\": 50, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 100.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8678759932518005, \"percentile_inc_nulls\": 0.8678759932518005, \"value_count\": 49, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 49.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.866401195526123, \"percentile_inc_nulls\": 0.866401195526123, \"value_count\": 45, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8656966090202332, \"percentile_inc_nulls\": 0.8656966090202332, \"value_count\": 43, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, 
\"sum_tokens_in_value_count_group\": 43.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8650083541870117, \"percentile_inc_nulls\": 0.8650083541870117, \"value_count\": 42, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8643528819084167, \"percentile_inc_nulls\": 0.8643528819084167, \"value_count\": 40, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 40.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.863074779510498, \"percentile_inc_nulls\": 0.863074779510498, \"value_count\": 39, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.861829400062561, \"percentile_inc_nulls\": 0.861829400062561, \"value_count\": 38, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 76.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8600596189498901, \"percentile_inc_nulls\": 0.8600596189498901, \"value_count\": 36, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8589125871658325, \"percentile_inc_nulls\": 0.8589125871658325, \"value_count\": 35, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.857241153717041, \"percentile_inc_nulls\": 0.857241153717041, \"value_count\": 34, \"group_name\": \"_street_address_\", 
\"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.855618953704834, \"percentile_inc_nulls\": 0.855618953704834, \"value_count\": 33, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8545701503753662, \"percentile_inc_nulls\": 0.8545701503753662, \"value_count\": 32, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 64.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8530462384223938, \"percentile_inc_nulls\": 0.8530462384223938, \"value_count\": 31, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 93.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8525546789169312, \"percentile_inc_nulls\": 0.8525546789169312, \"value_count\": 30, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 30.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8511290550231934, \"percentile_inc_nulls\": 0.8511290550231934, \"value_count\": 29, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 87.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8492937088012695, \"percentile_inc_nulls\": 0.8492937088012695, \"value_count\": 28, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8470815420150757, \"percentile_inc_nulls\": 0.8470815420150757, 
\"value_count\": 27, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 135.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8453773856163025, \"percentile_inc_nulls\": 0.8453773856163025, \"value_count\": 26, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 104.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8408710956573486, \"percentile_inc_nulls\": 0.8408710956573486, \"value_count\": 25, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 275.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8389047384262085, \"percentile_inc_nulls\": 0.8389047384262085, \"value_count\": 24, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8373971581459045, \"percentile_inc_nulls\": 0.8373971581459045, \"value_count\": 23, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 92.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8363156318664551, \"percentile_inc_nulls\": 0.8363156318664551, \"value_count\": 22, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 66.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8345950841903687, \"percentile_inc_nulls\": 0.8345950841903687, \"value_count\": 21, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 105.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 
0.8319732546806335, \"percentile_inc_nulls\": 0.8319732546806335, \"value_count\": 20, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.828859806060791, \"percentile_inc_nulls\": 0.828859806060791, \"value_count\": 19, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 190.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8262052536010742, \"percentile_inc_nulls\": 0.8262052536010742, \"value_count\": 18, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 162.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8250909447669983, \"percentile_inc_nulls\": 0.8250909447669983, \"value_count\": 17, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 68.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8222069144248962, \"percentile_inc_nulls\": 0.8222069144248962, \"value_count\": 16, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 176.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8190115690231323, \"percentile_inc_nulls\": 0.8190115690231323, \"value_count\": 15, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 195.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8148821592330933, \"percentile_inc_nulls\": 0.8148821592330933, \"value_count\": 14, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 252.0, 
\"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8112607598304749, \"percentile_inc_nulls\": 0.8112607598304749, \"value_count\": 13, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 221.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8055583238601685, \"percentile_inc_nulls\": 0.8055583238601685, \"value_count\": 12, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 348.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8001507520675659, \"percentile_inc_nulls\": 0.8001507520675659, \"value_count\": 11, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 330.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.7937600612640381, \"percentile_inc_nulls\": 0.7937600612640381, \"value_count\": 10, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 390.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.7862386703491211, \"percentile_inc_nulls\": 0.7862386703491211, \"value_count\": 9, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 459.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.7773244380950928, \"percentile_inc_nulls\": 0.7773244380950928, \"value_count\": 8, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 544.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.767918586730957, \"percentile_inc_nulls\": 0.767918586730957, \"value_count\": 7, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, 
\"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 574.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.754645586013794, \"percentile_inc_nulls\": 0.754645586013794, \"value_count\": 6, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 810.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.7358830571174622, \"percentile_inc_nulls\": 0.7358830571174622, \"value_count\": 5, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1145.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.7013404369354248, \"percentile_inc_nulls\": 0.7013404369354248, \"value_count\": 4, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 2108.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.6384655833244324, \"percentile_inc_nulls\": 0.6384655833244324, \"value_count\": 3, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 3837.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.4765673875808716, \"percentile_inc_nulls\": 0.4765673875808716, \"value_count\": 2, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 9880.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 1, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 29083.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 798, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, 
\"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 798.0, \"distinct_value_count\": 36703}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"street_address\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 36703 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 798, \"group_name\": \"_street_address_\", \"value\": \"4500 park granada\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 552, \"group_name\": \"_street_address_\", \"value\": \"711 high st\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 351, \"group_name\": \"_street_address_\", \"value\": \"11 madison ave\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 347, \"group_name\": \"_street_address_\", \"value\": \"383 madison ave\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 303, \"group_name\": \"_street_address_\", \"value\": \"8400 normandale lk blvd\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 277, \"group_name\": \"_street_address_\", \"value\": \"1585 broadway\", 
\"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 266, \"group_name\": \"_street_address_\", \"value\": \"85 broad st\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 242, \"group_name\": \"_street_address_\", \"value\": \"7485 new horizon way\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 231, \"group_name\": \"_street_address_\", \"value\": \"co wilmington trust company\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 224, \"group_name\": \"_street_address_\", \"value\": \"4ld financial ctr floor 10\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"101 east kennedy blvd\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"7505 floyd ct\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"5972 ne 4th ave\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 
36703}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"har hotzvim 13 hartom st\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"3133 west frye rd\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 798]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}], \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\"}, {\"mode\": \"vega-lite\"});\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"vconcat\": [{\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9997377991676331, \"percentile_inc_nulls\": 0.9997377991676331, \"value_count\": 8, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 0.9989348649978638, \"percentile_inc_nulls\": 0.9989348649978638, \"value_count\": 7, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 49.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 0.9960836172103882, \"percentile_inc_nulls\": 0.9960836172103882, \"value_count\": 6, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 174.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 0.9845311641693115, \"percentile_inc_nulls\": 0.9845311641693115, \"value_count\": 5, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 705.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 0.9480876922607422, \"percentile_inc_nulls\": 0.9480876922607422, \"value_count\": 4, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 2224.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 0.8398387432098389, \"percentile_inc_nulls\": 0.8398387432098389, \"value_count\": 3, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 6606.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 
0.5750991106033325, \"percentile_inc_nulls\": 0.5750991106033325, \"value_count\": 2, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 16156.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 1, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 35096.0, \"distinct_value_count\": 46111}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 8, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 46111}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"company_name\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 46111 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 8, \"group_name\": \"_company_name_\", \"value\": \"comprehensive care corporation\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 8, \"group_name\": \"_company_name_\", \"value\": \"la jolla pharmaceutical company\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 
46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"dycom industries incorporated\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"comerica inc /new/\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"omega healthcare investors incorporated\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"softech incorporated\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"camelot corporation\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"empire petroleum corporation\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 7, \"group_name\": \"_company_name_\", \"value\": \"stillwater mining co /de/\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 6, \"group_name\": \"_company_name_\", \"value\": \"brandywine realty trust\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": 
\"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"cnl strategic capital limited liability company\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"bank 2021 bnk36\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"elite pharmaceuticals inc /nv/\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"bank 2021 bnk38\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"mosaic immunoengineering incorporated\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 46111}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 8]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.028595805168151855, \"percentile_inc_nulls\": 0.02920067310333252, \"value_count\": 102, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, 
\"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.026972532272338867, \"percentile_inc_nulls\": 0.027578413486480713, \"value_count\": 99, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.02552962303161621, \"percentile_inc_nulls\": 0.026136398315429688, \"value_count\": 88, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.022938966751098633, \"percentile_inc_nulls\": 0.023547351360321045, \"value_count\": 79, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 158.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.021774768829345703, \"percentile_inc_nulls\": 0.022383928298950195, \"value_count\": 71, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 71.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.020659804344177246, \"percentile_inc_nulls\": 0.021269619464874268, \"value_count\": 68, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 68.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.01956123113632202, \"percentile_inc_nulls\": 0.020171701908111572, \"value_count\": 67, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 67.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.018577396869659424, \"percentile_inc_nulls\": 0.019188523292541504, \"value_count\": 60, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 
61026, \"sum_tokens_in_value_count_group\": 60.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.017626404762268066, \"percentile_inc_nulls\": 0.0182381272315979, \"value_count\": 58, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.015757203102111816, \"percentile_inc_nulls\": 0.016370058059692383, \"value_count\": 57, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.014838993549346924, \"percentile_inc_nulls\": 0.015452444553375244, \"value_count\": 56, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.014019131660461426, \"percentile_inc_nulls\": 0.014633119106292725, \"value_count\": 50, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 50.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0132485032081604, \"percentile_inc_nulls\": 0.013862967491149902, \"value_count\": 47, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 47.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.012494266033172607, \"percentile_inc_nulls\": 0.013109147548675537, \"value_count\": 46, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 46.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.011789202690124512, \"percentile_inc_nulls\": 0.012404561042785645, \"value_count\": 43, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 
61026, \"sum_tokens_in_value_count_group\": 43.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.01121532917022705, \"percentile_inc_nulls\": 0.011831045150756836, \"value_count\": 35, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 35.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.010674238204956055, \"percentile_inc_nulls\": 0.011290252208709717, \"value_count\": 33, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.010149538516998291, \"percentile_inc_nulls\": 0.010765910148620605, \"value_count\": 32, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 32.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.009674012660980225, \"percentile_inc_nulls\": 0.010290682315826416, \"value_count\": 29, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 29.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.009264111518859863, \"percentile_inc_nulls\": 0.009881019592285156, \"value_count\": 25, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 25.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.008870601654052734, \"percentile_inc_nulls\": 0.009487748146057129, \"value_count\": 24, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 24.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.008181929588317871, \"percentile_inc_nulls\": 0.008799552917480469, \"value_count\": 21, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, 
\"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.007870376110076904, \"percentile_inc_nulls\": 0.008488178253173828, \"value_count\": 19, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 19.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.007575273513793945, \"percentile_inc_nulls\": 0.008193254470825195, \"value_count\": 18, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 18.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.007050573825836182, \"percentile_inc_nulls\": 0.007668852806091309, \"value_count\": 16, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 32.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.00680464506149292, \"percentile_inc_nulls\": 0.007423043251037598, \"value_count\": 15, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 15.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.006115972995758057, \"percentile_inc_nulls\": 0.0067348480224609375, \"value_count\": 14, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.00590282678604126, \"percentile_inc_nulls\": 0.006521821022033691, \"value_count\": 13, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 13.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.00570601224899292, \"percentile_inc_nulls\": 0.006325185298919678, \"value_count\": 12, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, 
\"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 12.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.005164921283721924, \"percentile_inc_nulls\": 0.005784392356872559, \"value_count\": 11, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0041811466217041016, \"percentile_inc_nulls\": 0.00480121374130249, \"value_count\": 10, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 60.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.003885984420776367, \"percentile_inc_nulls\": 0.004506289958953857, \"value_count\": 9, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 18.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.003623664379119873, \"percentile_inc_nulls\": 0.004244089126586914, \"value_count\": 8, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0035088658332824707, \"percentile_inc_nulls\": 0.0041294097900390625, \"value_count\": 7, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 7.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.002721846103668213, \"percentile_inc_nulls\": 0.003342866897583008, \"value_count\": 6, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 48.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0019840002059936523, \"percentile_inc_nulls\": 0.002605438232421875, \"value_count\": 5, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, 
\"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 45.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0012625455856323242, \"percentile_inc_nulls\": 0.00188446044921875, \"value_count\": 4, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 44.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0007214546203613281, \"percentile_inc_nulls\": 0.0013436675071716309, \"value_count\": 3, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0003935098648071289, \"percentile_inc_nulls\": 0.0010159611701965332, \"value_count\": 2, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 20.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0006226897239685059, \"value_count\": 1, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 24.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.8305076360702515, \"percentile_inc_nulls\": 0.8306131958961487, \"value_count\": 10337, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 10337.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.691791832447052, \"percentile_inc_nulls\": 0.6919837594032288, \"value_count\": 8460, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 8460.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.6141864061355591, \"percentile_inc_nulls\": 0.6144266128540039, \"value_count\": 4733, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, 
\"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 4733.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.563307523727417, \"percentile_inc_nulls\": 0.5635794401168823, \"value_count\": 3103, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 3103.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.5253000259399414, \"percentile_inc_nulls\": 0.5255956649780273, \"value_count\": 2318, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 2318.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.49045711755752563, \"percentile_inc_nulls\": 0.4907744526863098, \"value_count\": 2125, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 2125.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.4584508538246155, \"percentile_inc_nulls\": 0.45878803730010986, \"value_count\": 1952, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1952.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.42987143993377686, \"percentile_inc_nulls\": 0.4302264451980591, \"value_count\": 1743, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1743.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.4037187695503235, \"percentile_inc_nulls\": 0.4040900468826294, \"value_count\": 1595, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1595.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.3788614273071289, \"percentile_inc_nulls\": 0.3792482018470764, \"value_count\": 1516, \"group_name\": \"_state_\", \"total_non_null_rows\": 
60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1516.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.35418444871902466, \"percentile_inc_nulls\": 0.35458654165267944, \"value_count\": 1505, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1505.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.33116352558135986, \"percentile_inc_nulls\": 0.3315799832344055, \"value_count\": 1404, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1404.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.3082737326622009, \"percentile_inc_nulls\": 0.3087044954299927, \"value_count\": 1396, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1396.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.2873516082763672, \"percentile_inc_nulls\": 0.28779536485671997, \"value_count\": 1276, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1276.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.26756083965301514, \"percentile_inc_nulls\": 0.2680169343948364, \"value_count\": 1207, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1207.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.24914735555648804, \"percentile_inc_nulls\": 0.2496148943901062, \"value_count\": 1123, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1123.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.23243916034698486, \"percentile_inc_nulls\": 0.23291712999343872, \"value_count\": 1019, \"group_name\": \"_state_\", 
\"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1019.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.21587854623794556, \"percentile_inc_nulls\": 0.21636676788330078, \"value_count\": 1010, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1010.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.20199054479599, \"percentile_inc_nulls\": 0.20248746871948242, \"value_count\": 847, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 847.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.18875843286514282, \"percentile_inc_nulls\": 0.18926358222961426, \"value_count\": 807, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 807.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.17613303661346436, \"percentile_inc_nulls\": 0.17664599418640137, \"value_count\": 770, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 770.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.1640486717224121, \"percentile_inc_nulls\": 0.1645691990852356, \"value_count\": 737, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 737.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.15265297889709473, \"percentile_inc_nulls\": 0.15318059921264648, \"value_count\": 695, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 695.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.14209353923797607, \"percentile_inc_nulls\": 0.14262771606445312, \"value_count\": 644, \"group_name\": \"_state_\", 
\"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 644.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.13258343935012817, \"percentile_inc_nulls\": 0.13312357664108276, \"value_count\": 580, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 580.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.12354886531829834, \"percentile_inc_nulls\": 0.12409466505050659, \"value_count\": 551, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 551.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.11457991600036621, \"percentile_inc_nulls\": 0.11513125896453857, \"value_count\": 547, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 547.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.10720139741897583, \"percentile_inc_nulls\": 0.1077573299407959, \"value_count\": 450, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 450.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.10013443231582642, \"percentile_inc_nulls\": 0.10069477558135986, \"value_count\": 431, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 431.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0935102105140686, \"percentile_inc_nulls\": 0.09407466650009155, \"value_count\": 404, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 404.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.08788615465164185, \"percentile_inc_nulls\": 0.08845412731170654, \"value_count\": 343, \"group_name\": \"_state_\", 
\"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 343.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.08291792869567871, \"percentile_inc_nulls\": 0.08348900079727173, \"value_count\": 303, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 303.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.07842528820037842, \"percentile_inc_nulls\": 0.0789991021156311, \"value_count\": 274, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 274.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.07416212558746338, \"percentile_inc_nulls\": 0.07473862171173096, \"value_count\": 260, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 260.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.07007938623428345, \"percentile_inc_nulls\": 0.07065838575363159, \"value_count\": 249, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 249.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.06240570545196533, \"percentile_inc_nulls\": 0.06298953294754028, \"value_count\": 234, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 468.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.0587000846862793, \"percentile_inc_nulls\": 0.05928617715835571, \"value_count\": 226, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 226.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.05512559413909912, \"percentile_inc_nulls\": 0.055713951587677, \"value_count\": 218, \"group_name\": \"_state_\", 
\"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 218.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.05199384689331055, \"percentile_inc_nulls\": 0.052584171295166016, \"value_count\": 191, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 191.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.048944056034088135, \"percentile_inc_nulls\": 0.049536287784576416, \"value_count\": 186, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 186.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.04597628116607666, \"percentile_inc_nulls\": 0.0465703010559082, \"value_count\": 181, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 181.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.04348397254943848, \"percentile_inc_nulls\": 0.044079601764678955, \"value_count\": 152, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 152.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.041270434856414795, \"percentile_inc_nulls\": 0.04186737537384033, \"value_count\": 135, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 135.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.03918802738189697, \"percentile_inc_nulls\": 0.039786338806152344, \"value_count\": 127, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 127.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.037335216999053955, \"percentile_inc_nulls\": 0.03793466091156006, \"value_count\": 113, \"group_name\": 
\"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 113.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.03549879789352417, \"percentile_inc_nulls\": 0.036099374294281006, \"value_count\": 112, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.03369516134262085, \"percentile_inc_nulls\": 0.03429687023162842, \"value_count\": 110, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.03195708990097046, \"percentile_inc_nulls\": 0.032559871673583984, \"value_count\": 106, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 106.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 0.03026825189590454, \"percentile_inc_nulls\": 0.030872106552124023, \"value_count\": 103, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 103.0, \"distinct_value_count\": 172}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 102, \"group_name\": \"_state_\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 172}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": 
\"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"state\\\"\", \"subtitle\": \"In this col, 38 values (0.1%) are null and there are 172 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 10337, \"group_name\": \"_state_\", \"value\": \"ca\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 8460, \"group_name\": \"_state_\", \"value\": \"ny\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 4733, \"group_name\": \"_state_\", \"value\": \"tx\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 3103, \"group_name\": \"_state_\", \"value\": \"fl\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 2318, \"group_name\": \"_state_\", \"value\": \"ma\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 2125, \"group_name\": \"_state_\", \"value\": \"il\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1952, \"group_name\": \"_state_\", \"value\": \"nj\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1743, \"group_name\": \"_state_\", \"value\": \"pa\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1595, \"group_name\": \"_state_\", \"value\": \"md\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1516, \"group_name\": \"_state_\", \"value\": \"nv\", \"total_non_null_rows\": 60988, 
\"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"j1\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"w5\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"s9\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"a7\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"z2\", \"total_non_null_rows\": 60988, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 172}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 10337]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": 
[{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.5075395107269287, \"percentile_inc_nulls\": 0.5076524615287781, \"value_count\": 148, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 148.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5051465034484863, \"percentile_inc_nulls\": 0.5052600502967834, \"value_count\": 146, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 146.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4980167746543884, \"percentile_inc_nulls\": 0.4981319308280945, \"value_count\": 145, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 435.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.48621582984924316, \"percentile_inc_nulls\": 0.4863336682319641, \"value_count\": 144, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 720.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4838883876800537, \"percentile_inc_nulls\": 0.4840068221092224, \"value_count\": 142, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 142.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.48161017894744873, \"percentile_inc_nulls\": 0.48172909021377563, \"value_count\": 139, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 139.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.47936469316482544, \"percentile_inc_nulls\": 0.4794841408729553, \"value_count\": 137, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 
61026, \"sum_tokens_in_value_count_group\": 137.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.47713565826416016, \"percentile_inc_nulls\": 0.47725558280944824, \"value_count\": 136, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 136.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4749557375907898, \"percentile_inc_nulls\": 0.47507619857788086, \"value_count\": 133, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 133.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.47279220819473267, \"percentile_inc_nulls\": 0.4729132056236267, \"value_count\": 132, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 132.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4706615209579468, \"percentile_inc_nulls\": 0.47078293561935425, \"value_count\": 130, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 130.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4685635566711426, \"percentile_inc_nulls\": 0.468685507774353, \"value_count\": 128, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 128.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4664819836616516, \"percentile_inc_nulls\": 0.46660441160202026, \"value_count\": 127, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 127.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4644496440887451, \"percentile_inc_nulls\": 0.4645724892616272, \"value_count\": 124, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, 
\"sum_tokens_in_value_count_group\": 124.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4624336361885071, \"percentile_inc_nulls\": 0.46255695819854736, \"value_count\": 123, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 123.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4604504108428955, \"percentile_inc_nulls\": 0.460574209690094, \"value_count\": 121, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 121.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.45848357677459717, \"percentile_inc_nulls\": 0.4586077928543091, \"value_count\": 120, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.45458269119262695, \"percentile_inc_nulls\": 0.4547078013420105, \"value_count\": 119, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 238.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4526650309562683, \"percentile_inc_nulls\": 0.45279061794281006, \"value_count\": 117, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 117.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.45078015327453613, \"percentile_inc_nulls\": 0.4509061574935913, \"value_count\": 115, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 115.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.445223867893219, \"percentile_inc_nulls\": 0.44535118341445923, \"value_count\": 113, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, 
\"sum_tokens_in_value_count_group\": 339.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4416508078575134, \"percentile_inc_nulls\": 0.44177889823913574, \"value_count\": 109, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 218.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4398806691169739, \"percentile_inc_nulls\": 0.4400091767311096, \"value_count\": 108, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.43812692165374756, \"percentile_inc_nulls\": 0.43825584650039673, \"value_count\": 107, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 107.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.43468499183654785, \"percentile_inc_nulls\": 0.4348146915435791, \"value_count\": 105, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 210.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4314069151878357, \"percentile_inc_nulls\": 0.431537389755249, \"value_count\": 100, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.42978429794311523, \"percentile_inc_nulls\": 0.4299151301383972, \"value_count\": 99, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.428178071975708, \"percentile_inc_nulls\": 0.42830926179885864, \"value_count\": 98, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, 
\"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.42499834299087524, \"percentile_inc_nulls\": 0.4251302480697632, \"value_count\": 97, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 194.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4234248995780945, \"percentile_inc_nulls\": 0.4235571622848511, \"value_count\": 96, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 96.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4218842387199402, \"percentile_inc_nulls\": 0.42201685905456543, \"value_count\": 94, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 94.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4203763008117676, \"percentile_inc_nulls\": 0.4205092787742615, \"value_count\": 92, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 92.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.418884813785553, \"percentile_inc_nulls\": 0.4190181493759155, \"value_count\": 91, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 91.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4145086407661438, \"percentile_inc_nulls\": 0.41464293003082275, \"value_count\": 89, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 267.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4131154417991638, \"percentile_inc_nulls\": 0.4132500886917114, \"value_count\": 85, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, 
\"sum_tokens_in_value_count_group\": 85.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4103618860244751, \"percentile_inc_nulls\": 0.41049718856811523, \"value_count\": 84, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 168.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.40764111280441284, \"percentile_inc_nulls\": 0.40777701139450073, \"value_count\": 83, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 166.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.40495312213897705, \"percentile_inc_nulls\": 0.4050896167755127, \"value_count\": 82, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 164.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4022979140281677, \"percentile_inc_nulls\": 0.4024350047111511, \"value_count\": 81, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 162.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.4009866714477539, \"percentile_inc_nulls\": 0.40112411975860596, \"value_count\": 80, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 80.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3983970284461975, \"percentile_inc_nulls\": 0.3985350728034973, \"value_count\": 79, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 158.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3971513509750366, \"percentile_inc_nulls\": 0.3972896933555603, \"value_count\": 76, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, 
\"sum_tokens_in_value_count_group\": 76.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3934635519981384, \"percentile_inc_nulls\": 0.39360272884368896, \"value_count\": 75, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 225.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.39103782176971436, \"percentile_inc_nulls\": 0.39117753505706787, \"value_count\": 74, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 148.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3863174319267273, \"percentile_inc_nulls\": 0.38645821809768677, \"value_count\": 72, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 288.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.38515371084213257, \"percentile_inc_nulls\": 0.3852947950363159, \"value_count\": 71, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 71.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.38400644063949585, \"percentile_inc_nulls\": 0.3841477632522583, \"value_count\": 70, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.38061362504959106, \"percentile_inc_nulls\": 0.3807557225227356, \"value_count\": 69, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 207.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3783845901489258, \"percentile_inc_nulls\": 0.3785271644592285, \"value_count\": 68, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, 
\"sum_tokens_in_value_count_group\": 136.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3773028254508972, \"percentile_inc_nulls\": 0.37744569778442383, \"value_count\": 66, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 66.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3741067051887512, \"percentile_inc_nulls\": 0.37425029277801514, \"value_count\": 65, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 195.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3730577826499939, \"percentile_inc_nulls\": 0.3732016086578369, \"value_count\": 64, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 64.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.37202519178390503, \"percentile_inc_nulls\": 0.37216925621032715, \"value_count\": 63, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 63.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3710089921951294, \"percentile_inc_nulls\": 0.3711532950401306, \"value_count\": 62, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 62.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3690093755722046, \"percentile_inc_nulls\": 0.369154155254364, \"value_count\": 61, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 122.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3660591244697571, \"percentile_inc_nulls\": 0.36620455980300903, \"value_count\": 60, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, 
\"sum_tokens_in_value_count_group\": 180.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.36130595207214355, \"percentile_inc_nulls\": 0.36145251989364624, \"value_count\": 58, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 290.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.8892185091972351, \"percentile_inc_nulls\": 0.8892439603805542, \"value_count\": 6759, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 6759.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.8651740550994873, \"percentile_inc_nulls\": 0.8652049899101257, \"value_count\": 1467, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1467.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.8483904600143433, \"percentile_inc_nulls\": 0.8484252691268921, \"value_count\": 1024, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1024.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.8343932628631592, \"percentile_inc_nulls\": 0.8344312310218811, \"value_count\": 854, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 854.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.8206254243850708, \"percentile_inc_nulls\": 0.8206666111946106, \"value_count\": 840, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 840.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.8072838187217712, \"percentile_inc_nulls\": 0.8073280453681946, \"value_count\": 814, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, 
\"sum_tokens_in_value_count_group\": 814.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7944011092185974, \"percentile_inc_nulls\": 0.7944482564926147, \"value_count\": 786, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 786.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7821412086486816, \"percentile_inc_nulls\": 0.7821912169456482, \"value_count\": 748, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 748.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7701107859611511, \"percentile_inc_nulls\": 0.7701635360717773, \"value_count\": 734, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 734.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7582606673240662, \"percentile_inc_nulls\": 0.758316159248352, \"value_count\": 723, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 723.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7466564178466797, \"percentile_inc_nulls\": 0.7467144727706909, \"value_count\": 708, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 708.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7364616394042969, \"percentile_inc_nulls\": 0.7365221381187439, \"value_count\": 622, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 622.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7267422676086426, \"percentile_inc_nulls\": 0.7268049716949463, \"value_count\": 593, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, 
\"sum_tokens_in_value_count_group\": 593.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7170720100402832, \"percentile_inc_nulls\": 0.7171369791030884, \"value_count\": 590, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 590.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.7074673771858215, \"percentile_inc_nulls\": 0.7075344920158386, \"value_count\": 586, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 586.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6978955268859863, \"percentile_inc_nulls\": 0.6979647874832153, \"value_count\": 584, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 584.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.688372790813446, \"percentile_inc_nulls\": 0.6884442567825317, \"value_count\": 581, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 581.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6793090105056763, \"percentile_inc_nulls\": 0.6793825626373291, \"value_count\": 553, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 553.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6723759174346924, \"percentile_inc_nulls\": 0.6724510788917542, \"value_count\": 423, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 423.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6655576229095459, \"percentile_inc_nulls\": 0.6656343340873718, \"value_count\": 416, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, 
\"sum_tokens_in_value_count_group\": 416.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6590179204940796, \"percentile_inc_nulls\": 0.6590961217880249, \"value_count\": 399, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 399.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6526912450790405, \"percentile_inc_nulls\": 0.6527709364891052, \"value_count\": 386, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 386.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6471513509750366, \"percentile_inc_nulls\": 0.6472322940826416, \"value_count\": 338, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 338.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6416934728622437, \"percentile_inc_nulls\": 0.6417756080627441, \"value_count\": 333, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 333.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6312692165374756, \"percentile_inc_nulls\": 0.6313538551330566, \"value_count\": 318, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 636.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.626384973526001, \"percentile_inc_nulls\": 0.626470685005188, \"value_count\": 298, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 298.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6217301487922668, \"percentile_inc_nulls\": 0.6218169331550598, \"value_count\": 284, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, 
\"sum_tokens_in_value_count_group\": 284.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.612518846988678, \"percentile_inc_nulls\": 0.6126077175140381, \"value_count\": 281, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 562.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.6079951524734497, \"percentile_inc_nulls\": 0.608085036277771, \"value_count\": 276, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 276.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.603569746017456, \"percentile_inc_nulls\": 0.6036607027053833, \"value_count\": 270, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 270.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5992263555526733, \"percentile_inc_nulls\": 0.5993183255195618, \"value_count\": 265, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 265.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5948993563652039, \"percentile_inc_nulls\": 0.5949922800064087, \"value_count\": 264, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 264.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5908181667327881, \"percentile_inc_nulls\": 0.5909121036529541, \"value_count\": 249, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 249.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5869500637054443, \"percentile_inc_nulls\": 0.5870448350906372, \"value_count\": 236, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, 
\"sum_tokens_in_value_count_group\": 236.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5831148028373718, \"percentile_inc_nulls\": 0.5832104682922363, \"value_count\": 234, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.575706422328949, \"percentile_inc_nulls\": 0.5758037567138672, \"value_count\": 226, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 452.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5720186233520508, \"percentile_inc_nulls\": 0.5721167922019958, \"value_count\": 225, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 225.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5685439109802246, \"percentile_inc_nulls\": 0.5686428546905518, \"value_count\": 212, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 212.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5616599917411804, \"percentile_inc_nulls\": 0.5617605447769165, \"value_count\": 210, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 420.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.558234453201294, \"percentile_inc_nulls\": 0.5583357810974121, \"value_count\": 209, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 209.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5548253059387207, \"percentile_inc_nulls\": 0.554927408695221, \"value_count\": 208, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, 
\"sum_tokens_in_value_count_group\": 208.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5514816641807556, \"percentile_inc_nulls\": 0.5515846014022827, \"value_count\": 204, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 204.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5482856035232544, \"percentile_inc_nulls\": 0.548389196395874, \"value_count\": 195, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 195.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5451222658157349, \"percentile_inc_nulls\": 0.5452266335487366, \"value_count\": 193, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 193.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5419917106628418, \"percentile_inc_nulls\": 0.5420968532562256, \"value_count\": 191, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 191.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5389431715011597, \"percentile_inc_nulls\": 0.5390489101409912, \"value_count\": 186, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 186.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5359601378440857, \"percentile_inc_nulls\": 0.5360665917396545, \"value_count\": 182, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 182.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.532993495464325, \"percentile_inc_nulls\": 0.5331006050109863, \"value_count\": 181, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, 
\"sum_tokens_in_value_count_group\": 181.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5302727222442627, \"percentile_inc_nulls\": 0.5303804874420166, \"value_count\": 166, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 166.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5275847315788269, \"percentile_inc_nulls\": 0.5276931524276733, \"value_count\": 164, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 164.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5249786972999573, \"percentile_inc_nulls\": 0.5250876545906067, \"value_count\": 159, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 159.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5224218368530273, \"percentile_inc_nulls\": 0.5225313901901245, \"value_count\": 156, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 156.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5198976993560791, \"percentile_inc_nulls\": 0.520007848739624, \"value_count\": 154, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 154.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5173900127410889, \"percentile_inc_nulls\": 0.5175007581710815, \"value_count\": 153, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 153.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5148987174034119, \"percentile_inc_nulls\": 0.5150099992752075, \"value_count\": 152, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, 
\"sum_tokens_in_value_count_group\": 152.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5124237537384033, \"percentile_inc_nulls\": 0.5125356316566467, \"value_count\": 151, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 151.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.5099652409553528, \"percentile_inc_nulls\": 0.510077714920044, \"value_count\": 150, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 150.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3594374656677246, \"percentile_inc_nulls\": 0.3595844507217407, \"value_count\": 57, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.35851961374282837, \"percentile_inc_nulls\": 0.3586667776107788, \"value_count\": 56, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.35671669244766235, \"percentile_inc_nulls\": 0.3568642735481262, \"value_count\": 55, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3549465537071228, \"percentile_inc_nulls\": 0.3550945520401001, \"value_count\": 54, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.35147184133529663, \"percentile_inc_nulls\": 0.351620614528656, \"value_count\": 53, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, 
\"sum_tokens_in_value_count_group\": 212.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3489149808883667, \"percentile_inc_nulls\": 0.34906435012817383, \"value_count\": 52, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 156.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.34724318981170654, \"percentile_inc_nulls\": 0.3473929166793823, \"value_count\": 51, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3439651131629944, \"percentile_inc_nulls\": 0.34411561489105225, \"value_count\": 50, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.33994948863983154, \"percentile_inc_nulls\": 0.3401009440422058, \"value_count\": 49, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 245.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3344423770904541, \"percentile_inc_nulls\": 0.33459508419036865, \"value_count\": 48, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 336.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.33213138580322266, \"percentile_inc_nulls\": 0.3322846293449402, \"value_count\": 47, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 141.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3291155695915222, \"percentile_inc_nulls\": 0.3292694687843323, \"value_count\": 46, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, 
\"sum_tokens_in_value_count_group\": 184.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3261653184890747, \"percentile_inc_nulls\": 0.32631993293762207, \"value_count\": 45, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 180.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3211171627044678, \"percentile_inc_nulls\": 0.32127290964126587, \"value_count\": 44, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 308.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3168884515762329, \"percentile_inc_nulls\": 0.3170452117919922, \"value_count\": 43, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 258.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.31482332944869995, \"percentile_inc_nulls\": 0.31498050689697266, \"value_count\": 42, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 126.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3101193308830261, \"percentile_inc_nulls\": 0.3102775812149048, \"value_count\": 41, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 287.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.30749690532684326, \"percentile_inc_nulls\": 0.3076557517051697, \"value_count\": 40, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.3030223846435547, \"percentile_inc_nulls\": 0.3031822443008423, \"value_count\": 39, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, 
\"sum_tokens_in_value_count_group\": 273.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.30115389823913574, \"percentile_inc_nulls\": 0.30131417512893677, \"value_count\": 38, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2987281084060669, \"percentile_inc_nulls\": 0.2988889813423157, \"value_count\": 37, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 148.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2957778573036194, \"percentile_inc_nulls\": 0.29593944549560547, \"value_count\": 36, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 180.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.29348325729370117, \"percentile_inc_nulls\": 0.29364532232284546, \"value_count\": 35, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 140.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2884678244590759, \"percentile_inc_nulls\": 0.2886310815811157, \"value_count\": 34, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 306.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.285222589969635, \"percentile_inc_nulls\": 0.2853865623474121, \"value_count\": 33, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 198.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2831246256828308, \"percentile_inc_nulls\": 0.2832890748977661, \"value_count\": 32, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, 
\"sum_tokens_in_value_count_group\": 128.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2770274877548218, \"percentile_inc_nulls\": 0.2771933078765869, \"value_count\": 31, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 372.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2735854983329773, \"percentile_inc_nulls\": 0.2737521529197693, \"value_count\": 30, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 210.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2726348638534546, \"percentile_inc_nulls\": 0.2728017568588257, \"value_count\": 29, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.26942241191864014, \"percentile_inc_nulls\": 0.26959002017974854, \"value_count\": 28, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 196.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.26632463932037354, \"percentile_inc_nulls\": 0.26649296283721924, \"value_count\": 27, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 189.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.26078474521636963, \"percentile_inc_nulls\": 0.2609543204307556, \"value_count\": 26, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 338.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.25586771965026855, \"percentile_inc_nulls\": 0.2560384273529053, \"value_count\": 25, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, 
\"sum_tokens_in_value_count_group\": 300.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.24996721744537354, \"percentile_inc_nulls\": 0.2501392960548401, \"value_count\": 24, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 360.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.24544352293014526, \"percentile_inc_nulls\": 0.245616614818573, \"value_count\": 23, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 276.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.23895299434661865, \"percentile_inc_nulls\": 0.23912757635116577, \"value_count\": 22, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 396.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.23551106452941895, \"percentile_inc_nulls\": 0.23568642139434814, \"value_count\": 21, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 210.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2273159623146057, \"percentile_inc_nulls\": 0.22749322652816772, \"value_count\": 20, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 500.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2207762598991394, \"percentile_inc_nulls\": 0.2209550142288208, \"value_count\": 19, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 399.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2142857313156128, \"percentile_inc_nulls\": 0.21446597576141357, \"value_count\": 18, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, 
\"sum_tokens_in_value_count_group\": 396.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.2109420895576477, \"percentile_inc_nulls\": 0.21112310886383057, \"value_count\": 17, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 204.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.20648396015167236, \"percentile_inc_nulls\": 0.2066659927368164, \"value_count\": 16, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 272.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.1981249451637268, \"percentile_inc_nulls\": 0.19830894470214844, \"value_count\": 15, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 510.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.19101160764694214, \"percentile_inc_nulls\": 0.1911972165107727, \"value_count\": 14, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 434.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.18227559328079224, \"percentile_inc_nulls\": 0.18246322870254517, \"value_count\": 13, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 533.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.17460501194000244, \"percentile_inc_nulls\": 0.17479437589645386, \"value_count\": 12, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 468.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.16559040546417236, \"percentile_inc_nulls\": 0.16578179597854614, \"value_count\": 11, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, 
\"sum_tokens_in_value_count_group\": 550.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.15854257345199585, \"percentile_inc_nulls\": 0.15873563289642334, \"value_count\": 10, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 430.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.14836424589157104, \"percentile_inc_nulls\": 0.14855962991714478, \"value_count\": 9, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 621.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.13669443130493164, \"percentile_inc_nulls\": 0.13689249753952026, \"value_count\": 8, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 712.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.12717169523239136, \"percentile_inc_nulls\": 0.12737196683883667, \"value_count\": 7, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 581.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.11497735977172852, \"percentile_inc_nulls\": 0.11518043279647827, \"value_count\": 6, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 744.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.09989839792251587, \"percentile_inc_nulls\": 0.10010486841201782, \"value_count\": 5, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 920.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.0844915509223938, \"percentile_inc_nulls\": 0.08470159769058228, \"value_count\": 4, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, 
\"sum_tokens_in_value_count_group\": 940.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.06585592031478882, \"percentile_inc_nulls\": 0.06607019901275635, \"value_count\": 3, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1137.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.040352702140808105, \"percentile_inc_nulls\": 0.0405728816986084, \"value_count\": 2, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1556.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.00022941827774047852, \"value_count\": 1, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 2462.0, \"distinct_value_count\": 5121}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 148, \"group_name\": \"_city_\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 148.0, \"distinct_value_count\": 5121}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"city\\\"\", \"subtitle\": \"In this col, 14 values (0.0%) are null and there are 5121 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 6759, 
\"group_name\": \"_city_\", \"value\": \"new york\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 1467, \"group_name\": \"_city_\", \"value\": \"houston\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 1024, \"group_name\": \"_city_\", \"value\": \"dallas\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 854, \"group_name\": \"_city_\", \"value\": \"las vegas\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 840, \"group_name\": \"_city_\", \"value\": \"calabasas\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 814, \"group_name\": \"_city_\", \"value\": \"charlotte\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 786, \"group_name\": \"_city_\", \"value\": \"chicago\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 748, \"group_name\": \"_city_\", \"value\": \"san diego\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 734, \"group_name\": \"_city_\", \"value\": \"wilmington\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 723, \"group_name\": \"_city_\", \"value\": \"boston\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": 
{\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"waxahachie\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"north lake\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"princeton junction\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"terrebonne\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"effingham\", \"total_non_null_rows\": 61012, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 5121}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 6759]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.8530462384223938, \"percentile_inc_nulls\": 0.8530462384223938, \"value_count\": 31, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 
61026, \"sum_tokens_in_value_count_group\": 93.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8525546789169312, \"percentile_inc_nulls\": 0.8525546789169312, \"value_count\": 30, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 30.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8511290550231934, \"percentile_inc_nulls\": 0.8511290550231934, \"value_count\": 29, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 87.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8492937088012695, \"percentile_inc_nulls\": 0.8492937088012695, \"value_count\": 28, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8470815420150757, \"percentile_inc_nulls\": 0.8470815420150757, \"value_count\": 27, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 135.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8453773856163025, \"percentile_inc_nulls\": 0.8453773856163025, \"value_count\": 26, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 104.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8408710956573486, \"percentile_inc_nulls\": 0.8408710956573486, \"value_count\": 25, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 275.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8389047384262085, \"percentile_inc_nulls\": 0.8389047384262085, \"value_count\": 24, \"group_name\": \"_street_address_\", 
\"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8373971581459045, \"percentile_inc_nulls\": 0.8373971581459045, \"value_count\": 23, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 92.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8363156318664551, \"percentile_inc_nulls\": 0.8363156318664551, \"value_count\": 22, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 66.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8345950841903687, \"percentile_inc_nulls\": 0.8345950841903687, \"value_count\": 21, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 105.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8319732546806335, \"percentile_inc_nulls\": 0.8319732546806335, \"value_count\": 20, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.828859806060791, \"percentile_inc_nulls\": 0.828859806060791, \"value_count\": 19, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 190.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8262052536010742, \"percentile_inc_nulls\": 0.8262052536010742, \"value_count\": 18, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 162.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8250909447669983, \"percentile_inc_nulls\": 0.8250909447669983, 
\"value_count\": 17, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 68.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8222069144248962, \"percentile_inc_nulls\": 0.8222069144248962, \"value_count\": 16, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 176.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8190115690231323, \"percentile_inc_nulls\": 0.8190115690231323, \"value_count\": 15, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 195.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8148821592330933, \"percentile_inc_nulls\": 0.8148821592330933, \"value_count\": 14, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 252.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8112607598304749, \"percentile_inc_nulls\": 0.8112607598304749, \"value_count\": 13, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 221.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8055583238601685, \"percentile_inc_nulls\": 0.8055583238601685, \"value_count\": 12, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 348.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8001507520675659, \"percentile_inc_nulls\": 0.8001507520675659, \"value_count\": 11, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 330.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 
0.7937600612640381, \"percentile_inc_nulls\": 0.7937600612640381, \"value_count\": 10, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 390.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.7862386703491211, \"percentile_inc_nulls\": 0.7862386703491211, \"value_count\": 9, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 459.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.7773244380950928, \"percentile_inc_nulls\": 0.7773244380950928, \"value_count\": 8, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 544.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.767918586730957, \"percentile_inc_nulls\": 0.767918586730957, \"value_count\": 7, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 574.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.754645586013794, \"percentile_inc_nulls\": 0.754645586013794, \"value_count\": 6, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 810.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.7358830571174622, \"percentile_inc_nulls\": 0.7358830571174622, \"value_count\": 5, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 1145.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.7013404369354248, \"percentile_inc_nulls\": 0.7013404369354248, \"value_count\": 4, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 2108.0, 
\"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.6384655833244324, \"percentile_inc_nulls\": 0.6384655833244324, \"value_count\": 3, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 3837.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.4765673875808716, \"percentile_inc_nulls\": 0.4765673875808716, \"value_count\": 2, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 9880.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 1, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 29083.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9869236350059509, \"percentile_inc_nulls\": 0.9869236350059509, \"value_count\": 798, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 798.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9778782725334167, \"percentile_inc_nulls\": 0.9778782725334167, \"value_count\": 552, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 552.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9721266627311707, \"percentile_inc_nulls\": 0.9721266627311707, \"value_count\": 351, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 351.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9664405584335327, \"percentile_inc_nulls\": 0.9664405584335327, \"value_count\": 347, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, 
\"sum_tokens_in_value_count_group\": 347.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9614754319190979, \"percentile_inc_nulls\": 0.9614754319190979, \"value_count\": 303, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 303.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9569363594055176, \"percentile_inc_nulls\": 0.9569363594055176, \"value_count\": 277, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 277.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9525775909423828, \"percentile_inc_nulls\": 0.9525775909423828, \"value_count\": 266, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 266.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9486120939254761, \"percentile_inc_nulls\": 0.9486120939254761, \"value_count\": 242, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 242.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9448267817497253, \"percentile_inc_nulls\": 0.9448267817497253, \"value_count\": 231, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 231.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9411562085151672, \"percentile_inc_nulls\": 0.9411562085151672, \"value_count\": 224, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 224.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.937616765499115, \"percentile_inc_nulls\": 0.937616765499115, \"value_count\": 216, \"group_name\": \"_street_address_\", 
\"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 216.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9341428279876709, \"percentile_inc_nulls\": 0.9341428279876709, \"value_count\": 212, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 212.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.930881917476654, \"percentile_inc_nulls\": 0.930881917476654, \"value_count\": 199, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 199.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9276373982429504, \"percentile_inc_nulls\": 0.9276373982429504, \"value_count\": 198, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 198.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9246222972869873, \"percentile_inc_nulls\": 0.9246222972869873, \"value_count\": 184, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 184.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9217710494995117, \"percentile_inc_nulls\": 0.9217710494995117, \"value_count\": 174, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 174.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9162651896476746, \"percentile_inc_nulls\": 0.9162651896476746, \"value_count\": 168, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 336.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9136925339698792, \"percentile_inc_nulls\": 
0.9136925339698792, \"value_count\": 157, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 157.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9112181663513184, \"percentile_inc_nulls\": 0.9112181663513184, \"value_count\": 151, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 151.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9088093638420105, \"percentile_inc_nulls\": 0.9088093638420105, \"value_count\": 147, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 147.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9065152406692505, \"percentile_inc_nulls\": 0.9065152406692505, \"value_count\": 140, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 140.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9042539000511169, \"percentile_inc_nulls\": 0.9042539000511169, \"value_count\": 138, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9021236896514893, \"percentile_inc_nulls\": 0.9021236896514893, \"value_count\": 130, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 130.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.9001245498657227, \"percentile_inc_nulls\": 0.9001245498657227, \"value_count\": 122, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 122.0, \"distinct_value_count\": 36703}, 
{\"percentile_ex_nulls\": 0.8982564806938171, \"percentile_inc_nulls\": 0.8982564806938171, \"value_count\": 114, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 114.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8964048027992249, \"percentile_inc_nulls\": 0.8964048027992249, \"value_count\": 113, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 113.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8946186900138855, \"percentile_inc_nulls\": 0.8946186900138855, \"value_count\": 109, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 109.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8929636478424072, \"percentile_inc_nulls\": 0.8929636478424072, \"value_count\": 101, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 101.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8913413882255554, \"percentile_inc_nulls\": 0.8913413882255554, \"value_count\": 99, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8897355198860168, \"percentile_inc_nulls\": 0.8897355198860168, \"value_count\": 98, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8866220712661743, \"percentile_inc_nulls\": 0.8866220712661743, \"value_count\": 95, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, 
\"sum_tokens_in_value_count_group\": 190.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8850981593132019, \"percentile_inc_nulls\": 0.8850981593132019, \"value_count\": 93, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 93.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8836561441421509, \"percentile_inc_nulls\": 0.8836561441421509, \"value_count\": 88, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8822305202484131, \"percentile_inc_nulls\": 0.8822305202484131, \"value_count\": 87, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 87.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8810015320777893, \"percentile_inc_nulls\": 0.8810015320777893, \"value_count\": 75, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 75.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8798708915710449, \"percentile_inc_nulls\": 0.8798708915710449, \"value_count\": 69, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 69.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.877707839012146, \"percentile_inc_nulls\": 0.877707839012146, \"value_count\": 66, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 132.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8766427636146545, \"percentile_inc_nulls\": 0.8766427636146545, \"value_count\": 65, \"group_name\": \"_street_address_\", 
\"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 65.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8756431937217712, \"percentile_inc_nulls\": 0.8756431937217712, \"value_count\": 61, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 61.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8746927380561829, \"percentile_inc_nulls\": 0.8746927380561829, \"value_count\": 58, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8737751245498657, \"percentile_inc_nulls\": 0.8737751245498657, \"value_count\": 56, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8728902339935303, \"percentile_inc_nulls\": 0.8728902339935303, \"value_count\": 54, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 54.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8711532950401306, \"percentile_inc_nulls\": 0.8711532950401306, \"value_count\": 53, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 106.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8703175783157349, \"percentile_inc_nulls\": 0.8703175783157349, \"value_count\": 51, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 51.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8686789274215698, \"percentile_inc_nulls\": 0.8686789274215698, 
\"value_count\": 50, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 100.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8678759932518005, \"percentile_inc_nulls\": 0.8678759932518005, \"value_count\": 49, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 49.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.866401195526123, \"percentile_inc_nulls\": 0.866401195526123, \"value_count\": 45, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8656966090202332, \"percentile_inc_nulls\": 0.8656966090202332, \"value_count\": 43, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 43.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8650083541870117, \"percentile_inc_nulls\": 0.8650083541870117, \"value_count\": 42, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8643528819084167, \"percentile_inc_nulls\": 0.8643528819084167, \"value_count\": 40, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 40.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.863074779510498, \"percentile_inc_nulls\": 0.863074779510498, \"value_count\": 39, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 
0.861829400062561, \"percentile_inc_nulls\": 0.861829400062561, \"value_count\": 38, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 76.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8600596189498901, \"percentile_inc_nulls\": 0.8600596189498901, \"value_count\": 36, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8589125871658325, \"percentile_inc_nulls\": 0.8589125871658325, \"value_count\": 35, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.857241153717041, \"percentile_inc_nulls\": 0.857241153717041, \"value_count\": 34, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.855618953704834, \"percentile_inc_nulls\": 0.855618953704834, \"value_count\": 33, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 0.8545701503753662, \"percentile_inc_nulls\": 0.8545701503753662, \"value_count\": 32, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 64.0, \"distinct_value_count\": 36703}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 31, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"sum_tokens_in_value_count_group\": 93.0, \"distinct_value_count\": 36703}]}, 
\"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"street_address\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 36703 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 798, \"group_name\": \"_street_address_\", \"value\": \"4500 park granada\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 552, \"group_name\": \"_street_address_\", \"value\": \"711 high st\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 351, \"group_name\": \"_street_address_\", \"value\": \"11 madison ave\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 347, \"group_name\": \"_street_address_\", \"value\": \"383 madison ave\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 303, \"group_name\": \"_street_address_\", \"value\": \"8400 normandale lk blvd\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 277, \"group_name\": \"_street_address_\", \"value\": \"1585 broadway\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 266, 
\"group_name\": \"_street_address_\", \"value\": \"85 broad st\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 242, \"group_name\": \"_street_address_\", \"value\": \"7485 new horizon way\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 231, \"group_name\": \"_street_address_\", \"value\": \"co wilmington trust company\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 224, \"group_name\": \"_street_address_\", \"value\": \"4ld financial ctr floor 10\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"101 east kennedy blvd\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"1700 s pavilion ctr dr\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"256 west 38th st 15th floor\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"3600 wilshire suite 
1720\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"8100 ne pkwy dr\", \"total_non_null_rows\": 61026, \"total_rows_inc_nulls\": 61026, \"distinct_value_count\": 36703}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 798]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}], \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\"}, {\"mode\": \"vega-lite\"});\n",
        "</script>"
       ],
       "text/plain": [
        "alt.VConcatChart(...)"
       ]
      },
-     "execution_count": 24,
+     "execution_count": 18,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -577,7 +577,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 19,
    "id": "d9cbc91b-b061-461b-b1f2-83036c70d7c7",
    "metadata": {},
    "outputs": [
@@ -586,23 +586,23 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-0a5f2699cc2548b6aab8c96de757f7be.vega-embed {\n",
+       "  #altair-viz-9179de0936c0491181345423b0e45c36.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-0a5f2699cc2548b6aab8c96de757f7be.vega-embed details,\n",
-       "  #altair-viz-0a5f2699cc2548b6aab8c96de757f7be.vega-embed details summary {\n",
+       "  #altair-viz-9179de0936c0491181345423b0e45c36.vega-embed details,\n",
+       "  #altair-viz-9179de0936c0491181345423b0e45c36.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-0a5f2699cc2548b6aab8c96de757f7be\"></div>\n",
+       "<div id=\"altair-viz-9179de0936c0491181345423b0e45c36\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-0a5f2699cc2548b6aab8c96de757f7be\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-0a5f2699cc2548b6aab8c96de757f7be\");\n",
+       "    if (outputDiv.id !== \"altair-viz-9179de0936c0491181345423b0e45c36\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-9179de0936c0491181345423b0e45c36\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -648,14 +648,14 @@
        "        .catch(showError)\n",
        "        .then(() => displayChart(vegaEmbed));\n",
        "    }\n",
-       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"vconcat\": [{\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9991835355758667, \"percentile_inc_nulls\": 0.9991835355758667, \"value_count\": 17, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 17.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9984630942344666, \"percentile_inc_nulls\": 0.9984630942344666, \"value_count\": 15, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 15.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9978387355804443, \"percentile_inc_nulls\": 0.9978387355804443, \"value_count\": 13, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 13.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9972623586654663, \"percentile_inc_nulls\": 0.9972623586654663, \"value_count\": 12, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 12.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.996734082698822, \"percentile_inc_nulls\": 0.996734082698822, \"value_count\": 11, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 11.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9952932000160217, \"percentile_inc_nulls\": 0.9952932000160217, \"value_count\": 10, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 30.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 
0.9944286942481995, \"percentile_inc_nulls\": 0.9944286942481995, \"value_count\": 9, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 18.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9917390942573547, \"percentile_inc_nulls\": 0.9917390942573547, \"value_count\": 8, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9883770942687988, \"percentile_inc_nulls\": 0.9883770942687988, \"value_count\": 7, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9765621423721313, \"percentile_inc_nulls\": 0.9765621423721313, \"value_count\": 6, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 246.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9439027905464172, \"percentile_inc_nulls\": 0.9439027905464172, \"value_count\": 5, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 680.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.8651361465454102, \"percentile_inc_nulls\": 0.8651361465454102, \"value_count\": 4, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1640.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.725949764251709, \"percentile_inc_nulls\": 0.725949764251709, \"value_count\": 3, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 2898.0, \"distinct_value_count\": 
14086}, {\"percentile_ex_nulls\": 0.4753373861312866, \"percentile_inc_nulls\": 0.4753373861312866, \"value_count\": 2, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 5218.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 1, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 9897.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 17, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 17.0, \"distinct_value_count\": 14086}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"company_name\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 14086 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 17, \"group_name\": \"_company_name_\", \"value\": \"calpine corporation\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 15, \"group_name\": \"_company_name_\", \"value\": \"georgia pacific corporation\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, 
\"distinct_value_count\": 14086}, {\"value_count\": 13, \"group_name\": \"_company_name_\", \"value\": \"weyerhaeuser company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 12, \"group_name\": \"_company_name_\", \"value\": \"calpine eastern corporation\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 11, \"group_name\": \"_company_name_\", \"value\": \"calpine operating services company incorporated\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 10, \"group_name\": \"_company_name_\", \"value\": \"international paper company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 10, \"group_name\": \"_company_name_\", \"value\": \"dow chemical company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 10, \"group_name\": \"_company_name_\", \"value\": \"springfield city of\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 9, \"group_name\": \"_company_name_\", \"value\": \"smurfit stone container corporation\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 9, \"group_name\": \"_company_name_\", \"value\": \"newpage corporation\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, 
\"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"0ham wham8 solar limited liability company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"louisiana energy and power authority\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"lost hills solar limited liability company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"long plain solar limited liability company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"los angeles county\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 17]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.8766032457351685, \"percentile_inc_nulls\": 0.8794006109237671, \"value_count\": 2511, 
\"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 2511.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.7900633811950684, \"percentile_inc_nulls\": 0.7948225140571594, \"value_count\": 1761, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1761.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.7252445220947266, \"percentile_inc_nulls\": 0.7314730286598206, \"value_count\": 1319, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1319.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.6654381155967712, \"percentile_inc_nulls\": 0.6730223894119263, \"value_count\": 1217, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1217.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.6124134063720703, \"percentile_inc_nulls\": 0.6211997270584106, \"value_count\": 1079, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1079.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.5711337327957153, \"percentile_inc_nulls\": 0.5808558464050293, \"value_count\": 840, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 840.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.5326551795005798, \"percentile_inc_nulls\": 0.5432496070861816, \"value_count\": 783, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 783.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.49756747484207153, \"percentile_inc_nulls\": 0.5089572668075562, \"value_count\": 714, 
\"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 714.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.4656739830970764, \"percentile_inc_nulls\": 0.4777868390083313, \"value_count\": 649, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 649.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.4348616600036621, \"percentile_inc_nulls\": 0.4476730227470398, \"value_count\": 627, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 627.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.410388708114624, \"percentile_inc_nulls\": 0.42375487089157104, \"value_count\": 498, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 498.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.3862597942352295, \"percentile_inc_nulls\": 0.4001728892326355, \"value_count\": 491, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 491.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.3641948103904724, \"percentile_inc_nulls\": 0.3786081075668335, \"value_count\": 449, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 449.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.3430144190788269, \"percentile_inc_nulls\": 0.35790789127349854, \"value_count\": 431, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 431.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.3237505555152893, \"percentile_inc_nulls\": 0.3390807509422302, \"value_count\": 392, \"group_name\": 
\"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 392.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.30483072996139526, \"percentile_inc_nulls\": 0.3205897808074951, \"value_count\": 385, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 385.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.2861074209213257, \"percentile_inc_nulls\": 0.30229097604751587, \"value_count\": 381, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 381.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.2678264379501343, \"percentile_inc_nulls\": 0.284424364566803, \"value_count\": 372, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 372.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.2511671185493469, \"percentile_inc_nulls\": 0.2681427597999573, \"value_count\": 339, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 339.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.235687255859375, \"percentile_inc_nulls\": 0.2530137896537781, \"value_count\": 315, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 315.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.2217307686805725, \"percentile_inc_nulls\": 0.23937368392944336, \"value_count\": 284, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 284.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.20811831951141357, \"percentile_inc_nulls\": 0.22606980800628662, \"value_count\": 277, \"group_name\": \"_state_\", 
\"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 277.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.19470244646072388, \"percentile_inc_nulls\": 0.21295809745788574, \"value_count\": 273, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 273.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.1813356876373291, \"percentile_inc_nulls\": 0.19989430904388428, \"value_count\": 272, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 272.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.15548676252365112, \"percentile_inc_nulls\": 0.17463135719299316, \"value_count\": 263, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 526.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.14349597692489624, \"percentile_inc_nulls\": 0.16291242837905884, \"value_count\": 244, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 244.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.13199663162231445, \"percentile_inc_nulls\": 0.1516737937927246, \"value_count\": 234, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.12148016691207886, \"percentile_inc_nulls\": 0.1413956880569458, \"value_count\": 214, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 214.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.11145508289337158, \"percentile_inc_nulls\": 0.1315978765487671, \"value_count\": 204, \"group_name\": \"_state_\", 
\"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 204.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.10162663459777832, \"percentile_inc_nulls\": 0.12199223041534424, \"value_count\": 200, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.09219127893447876, \"percentile_inc_nulls\": 0.11277073621749878, \"value_count\": 192, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 192.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.08457416296005249, \"percentile_inc_nulls\": 0.10532635450363159, \"value_count\": 155, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 155.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.07730108499526978, \"percentile_inc_nulls\": 0.09821814298629761, \"value_count\": 148, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 148.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.07012629508972168, \"percentile_inc_nulls\": 0.09120601415634155, \"value_count\": 146, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 146.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.06373775005340576, \"percentile_inc_nulls\": 0.08496230840682983, \"value_count\": 130, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 130.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.05759495496749878, \"percentile_inc_nulls\": 0.07895874977111816, \"value_count\": 125, \"group_name\": \"_state_\", 
\"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 125.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.052091002464294434, \"percentile_inc_nulls\": 0.07357954978942871, \"value_count\": 112, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.0471767783164978, \"percentile_inc_nulls\": 0.06877672672271729, \"value_count\": 100, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 100.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.042459070682525635, \"percentile_inc_nulls\": 0.06416600942611694, \"value_count\": 96, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 96.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.0380362868309021, \"percentile_inc_nulls\": 0.05984342098236084, \"value_count\": 90, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.033662617206573486, \"percentile_inc_nulls\": 0.0555688738822937, \"value_count\": 89, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 89.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.029436349868774414, \"percentile_inc_nulls\": 0.05143845081329346, \"value_count\": 86, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 86.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.02555406093597412, \"percentile_inc_nulls\": 0.047644197940826416, \"value_count\": 79, \"group_name\": \"_state_\", 
\"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 79.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.021720945835113525, \"percentile_inc_nulls\": 0.04389798641204834, \"value_count\": 78, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.018526732921600342, \"percentile_inc_nulls\": 0.04077613353729248, \"value_count\": 65, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 65.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.01572561264038086, \"percentile_inc_nulls\": 0.03803849220275879, \"value_count\": 57, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 57.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.013071894645690918, \"percentile_inc_nulls\": 0.03544497489929199, \"value_count\": 54, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 54.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.010467350482940674, \"percentile_inc_nulls\": 0.03289949893951416, \"value_count\": 53, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 53.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.006142795085906982, \"percentile_inc_nulls\": 0.028672993183135986, \"value_count\": 44, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.004078805446624756, \"percentile_inc_nulls\": 0.02665579319000244, \"value_count\": 42, \"group_name\": \"_state_\", 
\"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.002457141876220703, \"percentile_inc_nulls\": 0.02507084608078003, \"value_count\": 33, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.0012776851654052734, \"percentile_inc_nulls\": 0.02391815185546875, \"value_count\": 24, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 24.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.0004914402961730957, \"percentile_inc_nulls\": 0.023149728775024414, \"value_count\": 16, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.0003439784049987793, \"percentile_inc_nulls\": 0.02300560474395752, \"value_count\": 3, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 3.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.00024569034576416016, \"percentile_inc_nulls\": 0.022909581661224365, \"value_count\": 2, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 2.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.022669434547424316, \"value_count\": 1, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 5.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 2511, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 
20821, \"sum_tokens_in_value_count_group\": 2511.0, \"distinct_value_count\": 62}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"state\\\"\", \"subtitle\": \"In this col, 472 values (2.3%) are null and there are 62 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 2511, \"group_name\": \"_state_\", \"value\": \"ca\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1761, \"group_name\": \"_state_\", \"value\": \"tx\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1319, \"group_name\": \"_state_\", \"value\": \"ny\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1217, \"group_name\": \"_state_\", \"value\": \"nc\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1079, \"group_name\": \"_state_\", \"value\": \"ma\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 840, \"group_name\": \"_state_\", \"value\": \"fl\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 783, \"group_name\": \"_state_\", \"value\": \"mn\", \"total_non_null_rows\": 20349, 
\"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 714, \"group_name\": \"_state_\", \"value\": \"nj\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 649, \"group_name\": \"_state_\", \"value\": \"pa\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 627, \"group_name\": \"_state_\", \"value\": \"il\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"gu\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"pr\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"8a\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"as\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"uk\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": 
\"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 2511]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9553115367889404, \"percentile_inc_nulls\": 0.9554296135902405, \"value_count\": 928, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 928.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.9231917858123779, \"percentile_inc_nulls\": 0.9233946800231934, \"value_count\": 667, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 667.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.89848792552948, \"percentile_inc_nulls\": 0.8987560868263245, \"value_count\": 513, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 513.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.8761436939239502, \"percentile_inc_nulls\": 0.8764708638191223, \"value_count\": 464, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 464.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.8545699715614319, \"percentile_inc_nulls\": 0.8549541234970093, \"value_count\": 448, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 448.0, \"distinct_value_count\": 3879}, 
{\"percentile_ex_nulls\": 0.8359819054603577, \"percentile_inc_nulls\": 0.83641517162323, \"value_count\": 386, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 386.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.8250987529754639, \"percentile_inc_nulls\": 0.8255607485771179, \"value_count\": 226, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 226.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.815178632736206, \"percentile_inc_nulls\": 0.8156668543815613, \"value_count\": 206, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 206.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.8057401180267334, \"percentile_inc_nulls\": 0.8062533140182495, \"value_count\": 196, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 196.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7966387271881104, \"percentile_inc_nulls\": 0.797175943851471, \"value_count\": 189, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 189.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7878744006156921, \"percentile_inc_nulls\": 0.7884347438812256, \"value_count\": 182, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 182.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7795916199684143, \"percentile_inc_nulls\": 0.7801738977432251, \"value_count\": 172, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 172.0, \"distinct_value_count\": 3879}, 
{\"percentile_ex_nulls\": 0.7714051604270935, \"percentile_inc_nulls\": 0.772009015083313, \"value_count\": 170, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 170.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7633631825447083, \"percentile_inc_nulls\": 0.7639882564544678, \"value_count\": 167, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 167.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7555619478225708, \"percentile_inc_nulls\": 0.7562077045440674, \"value_count\": 162, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 162.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7479052543640137, \"percentile_inc_nulls\": 0.7485711574554443, \"value_count\": 159, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 159.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7406337261199951, \"percentile_inc_nulls\": 0.7413188219070435, \"value_count\": 151, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 151.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7338438034057617, \"percentile_inc_nulls\": 0.7345468401908875, \"value_count\": 141, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 141.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7271501421928406, \"percentile_inc_nulls\": 0.7278709411621094, \"value_count\": 139, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 139.0, \"distinct_value_count\": 3879}, 
{\"percentile_ex_nulls\": 0.7211307287216187, \"percentile_inc_nulls\": 0.7218673229217529, \"value_count\": 125, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 125.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.715207576751709, \"percentile_inc_nulls\": 0.7159598469734192, \"value_count\": 123, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 123.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7093807458877563, \"percentile_inc_nulls\": 0.7101483941078186, \"value_count\": 121, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 121.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7039391398429871, \"percentile_inc_nulls\": 0.704721212387085, \"value_count\": 113, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 113.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6933448910713196, \"percentile_inc_nulls\": 0.6941549777984619, \"value_count\": 110, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 220.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6880959272384644, \"percentile_inc_nulls\": 0.6889198422431946, \"value_count\": 109, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 109.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.677983283996582, \"percentile_inc_nulls\": 0.6788338422775269, \"value_count\": 105, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 210.0, \"distinct_value_count\": 3879}, 
{\"percentile_ex_nulls\": 0.6729750633239746, \"percentile_inc_nulls\": 0.6738389134407043, \"value_count\": 104, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 104.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6682076454162598, \"percentile_inc_nulls\": 0.6690840721130371, \"value_count\": 99, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6634883880615234, \"percentile_inc_nulls\": 0.6643773317337036, \"value_count\": 98, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6589136123657227, \"percentile_inc_nulls\": 0.6598145961761475, \"value_count\": 95, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 95.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6501492857933044, \"percentile_inc_nulls\": 0.6510734558105469, \"value_count\": 91, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 182.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6459115743637085, \"percentile_inc_nulls\": 0.6468468904495239, \"value_count\": 88, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6420109868049622, \"percentile_inc_nulls\": 0.6429566144943237, \"value_count\": 81, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 81.0, \"distinct_value_count\": 3879}, 
{\"percentile_ex_nulls\": 0.6383511424064636, \"percentile_inc_nulls\": 0.6393064260482788, \"value_count\": 76, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 76.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6313204169273376, \"percentile_inc_nulls\": 0.6322942972183228, \"value_count\": 73, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 146.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6243860125541687, \"percentile_inc_nulls\": 0.6253782510757446, \"value_count\": 72, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6212077140808105, \"percentile_inc_nulls\": 0.6222083568572998, \"value_count\": 66, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 66.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6180776357650757, \"percentile_inc_nulls\": 0.619086503982544, \"value_count\": 65, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 65.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6150919795036316, \"percentile_inc_nulls\": 0.616108775138855, \"value_count\": 62, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 62.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.612250804901123, \"percentile_inc_nulls\": 0.6132750511169434, \"value_count\": 59, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 59.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 
0.6094577312469482, \"percentile_inc_nulls\": 0.6104893684387207, \"value_count\": 58, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6041606664657593, \"percentile_inc_nulls\": 0.6052062511444092, \"value_count\": 55, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5990561246871948, \"percentile_inc_nulls\": 0.6001152992248535, \"value_count\": 53, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 106.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5940479636192322, \"percentile_inc_nulls\": 0.5951203107833862, \"value_count\": 52, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 104.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.591592013835907, \"percentile_inc_nulls\": 0.5926708579063416, \"value_count\": 51, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 51.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5891842842102051, \"percentile_inc_nulls\": 0.5902694463729858, \"value_count\": 50, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 50.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5869209170341492, \"percentile_inc_nulls\": 0.5880120992660522, \"value_count\": 47, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 47.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5847057700157166, 
\"percentile_inc_nulls\": 0.5858027935028076, \"value_count\": 46, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 46.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5825387239456177, \"percentile_inc_nulls\": 0.583641529083252, \"value_count\": 45, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 45.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5783973932266235, \"percentile_inc_nulls\": 0.5795110464096069, \"value_count\": 43, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 86.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5743522644042969, \"percentile_inc_nulls\": 0.5754766464233398, \"value_count\": 42, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 84.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.568429172039032, \"percentile_inc_nulls\": 0.5695691704750061, \"value_count\": 41, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 123.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5626504421234131, \"percentile_inc_nulls\": 0.5638057589530945, \"value_count\": 40, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5607724189758301, \"percentile_inc_nulls\": 0.5619326829910278, \"value_count\": 39, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 39.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5536453723907471, \"percentile_inc_nulls\": 
0.5548244714736938, \"value_count\": 37, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 148.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5519117712974548, \"percentile_inc_nulls\": 0.5530954599380493, \"value_count\": 36, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 36.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5485408902168274, \"percentile_inc_nulls\": 0.5497334003448486, \"value_count\": 35, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.537416934967041, \"percentile_inc_nulls\": 0.5386388301849365, \"value_count\": 33, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 231.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5327939987182617, \"percentile_inc_nulls\": 0.534028172492981, \"value_count\": 32, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 96.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5253298282623291, \"percentile_inc_nulls\": 0.526583731174469, \"value_count\": 31, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 155.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.523885190486908, \"percentile_inc_nulls\": 0.5251429080963135, \"value_count\": 30, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 30.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5210921764373779, \"percentile_inc_nulls\": 0.5223572254180908, 
\"value_count\": 29, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5156987309455872, \"percentile_inc_nulls\": 0.5169780254364014, \"value_count\": 28, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5104979276657104, \"percentile_inc_nulls\": 0.5117909908294678, \"value_count\": 27, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5067417621612549, \"percentile_inc_nulls\": 0.5080447196960449, \"value_count\": 26, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.4995184540748596, \"percentile_inc_nulls\": 0.5008404850959778, \"value_count\": 25, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 150.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.49720698595046997, \"percentile_inc_nulls\": 0.49853515625, \"value_count\": 24, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 48.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.49056148529052734, \"percentile_inc_nulls\": 0.49190717935562134, \"value_count\": 23, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.4852643609046936, \"percentile_inc_nulls\": 0.4866240620613098, \"value_count\": 22, 
\"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.47616297006607056, \"percentile_inc_nulls\": 0.47754669189453125, \"value_count\": 21, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 189.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.46845805644989014, \"percentile_inc_nulls\": 0.469862163066864, \"value_count\": 20, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.46388328075408936, \"percentile_inc_nulls\": 0.46529942750930786, \"value_count\": 19, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 95.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.4595492482185364, \"percentile_inc_nulls\": 0.46097689867019653, \"value_count\": 18, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.4489068388938904, \"percentile_inc_nulls\": 0.4503626227378845, \"value_count\": 17, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 221.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.44120198488235474, \"percentile_inc_nulls\": 0.4426780939102173, \"value_count\": 16, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.43325626850128174, \"percentile_inc_nulls\": 0.4347533583641052, \"value_count\": 15, 
\"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 165.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.42044687271118164, \"percentile_inc_nulls\": 0.42197781801223755, \"value_count\": 14, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 266.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.4135606288909912, \"percentile_inc_nulls\": 0.4151097536087036, \"value_count\": 13, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 143.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.4025811553001404, \"percentile_inc_nulls\": 0.4041592478752136, \"value_count\": 12, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 228.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.3919869065284729, \"percentile_inc_nulls\": 0.3935930132865906, \"value_count\": 11, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 220.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.37465089559555054, \"percentile_inc_nulls\": 0.3763027787208557, \"value_count\": 10, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 360.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.35991525650024414, \"percentile_inc_nulls\": 0.36160606145858765, \"value_count\": 9, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 306.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.33757102489471436, \"percentile_inc_nulls\": 0.3393208980560303, \"value_count\": 8, 
\"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 464.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.31869399547576904, \"percentile_inc_nulls\": 0.32049375772476196, \"value_count\": 7, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 392.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.2932678461074829, \"percentile_inc_nulls\": 0.2951347231864929, \"value_count\": 6, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 528.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.25739187002182007, \"percentile_inc_nulls\": 0.25935351848602295, \"value_count\": 5, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 745.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.2086583971977234, \"percentile_inc_nulls\": 0.2107487916946411, \"value_count\": 4, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1012.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.15621691942214966, \"percentile_inc_nulls\": 0.15844577550888062, \"value_count\": 3, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1089.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.09313303232192993, \"percentile_inc_nulls\": 0.09552854299545288, \"value_count\": 2, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1310.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0026415586471557617, \"value_count\": 1, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1934.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 928, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 928.0, \"distinct_value_count\": 3879}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"city\\\"\", \"subtitle\": \"In this col, 55 values (0.3%) are null and there are 3879 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 928, \"group_name\": \"_city_\", \"value\": \"houston\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 667, \"group_name\": \"_city_\", \"value\": \"new york\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 513, \"group_name\": \"_city_\", \"value\": \"san francisco\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 464, \"group_name\": \"_city_\", \"value\": \"juno beach\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 448, \"group_name\": \"_city_\", \"value\": \"boston\", 
\"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 386, \"group_name\": \"_city_\", \"value\": \"charlotte\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 226, \"group_name\": \"_city_\", \"value\": \"chicago\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 206, \"group_name\": \"_city_\", \"value\": \"san diego\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 196, \"group_name\": \"_city_\", \"value\": \"andover\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 189, \"group_name\": \"_city_\", \"value\": \"nashville\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"loma linda\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"combined locks\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"gatlinburg\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 
20821, \"distinct_value_count\": 3879}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"lanai city\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"kissimmee\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 928]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9795879125595093, \"percentile_inc_nulls\": 0.9795879125595093, \"value_count\": 425, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 425.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9712309837341309, \"percentile_inc_nulls\": 0.9712309837341309, \"value_count\": 174, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 174.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.964026689529419, \"percentile_inc_nulls\": 0.964026689529419, \"value_count\": 150, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 150.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9597522020339966, \"percentile_inc_nulls\": 
0.9597522020339966, \"value_count\": 89, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 89.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9559579491615295, \"percentile_inc_nulls\": 0.9559579491615295, \"value_count\": 79, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 79.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9490418434143066, \"percentile_inc_nulls\": 0.9490418434143066, \"value_count\": 72, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.942702054977417, \"percentile_inc_nulls\": 0.942702054977417, \"value_count\": 66, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 132.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9365544319152832, \"percentile_inc_nulls\": 0.9365544319152832, \"value_count\": 64, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 128.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9335286617279053, \"percentile_inc_nulls\": 0.9335286617279053, \"value_count\": 63, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 63.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9305508732795715, \"percentile_inc_nulls\": 0.9305508732795715, \"value_count\": 62, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 62.0, \"distinct_value_count\": 11403}, 
{\"percentile_ex_nulls\": 0.9277172088623047, \"percentile_inc_nulls\": 0.9277172088623047, \"value_count\": 59, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 59.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9251717329025269, \"percentile_inc_nulls\": 0.9251717329025269, \"value_count\": 53, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 53.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9202728271484375, \"percentile_inc_nulls\": 0.9202728271484375, \"value_count\": 51, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9179674386978149, \"percentile_inc_nulls\": 0.9179674386978149, \"value_count\": 48, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 48.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9111953973770142, \"percentile_inc_nulls\": 0.9111953973770142, \"value_count\": 47, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 141.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9089860916137695, \"percentile_inc_nulls\": 0.9089860916137695, \"value_count\": 46, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 46.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9068248271942139, \"percentile_inc_nulls\": 0.9068248271942139, \"value_count\": 45, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, 
\"sum_tokens_in_value_count_group\": 45.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9047116041183472, \"percentile_inc_nulls\": 0.9047116041183472, \"value_count\": 44, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 44.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9027424454689026, \"percentile_inc_nulls\": 0.9027424454689026, \"value_count\": 41, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 41.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8989001512527466, \"percentile_inc_nulls\": 0.8989001512527466, \"value_count\": 40, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 80.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8915998339653015, \"percentile_inc_nulls\": 0.8915998339653015, \"value_count\": 38, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 152.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.886268675327301, \"percentile_inc_nulls\": 0.886268675327301, \"value_count\": 37, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 111.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8829066753387451, \"percentile_inc_nulls\": 0.8829066753387451, \"value_count\": 35, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8813217282295227, \"percentile_inc_nulls\": 0.8813217282295227, \"value_count\": 33, \"group_name\": \"_street_address_\", 
\"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8738773465156555, \"percentile_inc_nulls\": 0.8738773465156555, \"value_count\": 31, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 155.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8709956407546997, \"percentile_inc_nulls\": 0.8709956407546997, \"value_count\": 30, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 60.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8696027994155884, \"percentile_inc_nulls\": 0.8696027994155884, \"value_count\": 29, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 29.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8628787994384766, \"percentile_inc_nulls\": 0.8628787994384766, \"value_count\": 28, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 140.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8563950061798096, \"percentile_inc_nulls\": 0.8563950061798096, \"value_count\": 27, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 135.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8526487350463867, \"percentile_inc_nulls\": 0.8526487350463867, \"value_count\": 26, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8490466475486755, \"percentile_inc_nulls\": 0.8490466475486755, 
\"value_count\": 25, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 75.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8455885648727417, \"percentile_inc_nulls\": 0.8455885648727417, \"value_count\": 24, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 72.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8411699533462524, \"percentile_inc_nulls\": 0.8411699533462524, \"value_count\": 23, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 92.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8401133418083191, \"percentile_inc_nulls\": 0.8401133418083191, \"value_count\": 22, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 22.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8380961418151855, \"percentile_inc_nulls\": 0.8380961418151855, \"value_count\": 21, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8332933187484741, \"percentile_inc_nulls\": 0.8332933187484741, \"value_count\": 20, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 100.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.828730583190918, \"percentile_inc_nulls\": 0.828730583190918, \"value_count\": 19, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 95.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 
0.8218145370483398, \"percentile_inc_nulls\": 0.8218145370483398, \"value_count\": 18, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8136496543884277, \"percentile_inc_nulls\": 0.8136496543884277, \"value_count\": 17, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 170.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8082705140113831, \"percentile_inc_nulls\": 0.8082705140113831, \"value_count\": 16, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.800345778465271, \"percentile_inc_nulls\": 0.800345778465271, \"value_count\": 15, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 165.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.7956390380859375, \"percentile_inc_nulls\": 0.7956390380859375, \"value_count\": 14, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.7844003438949585, \"percentile_inc_nulls\": 0.7844003438949585, \"value_count\": 13, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.7728735208511353, \"percentile_inc_nulls\": 0.7728735208511353, \"value_count\": 12, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 240.0, 
\"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.7633638978004456, \"percentile_inc_nulls\": 0.7633638978004456, \"value_count\": 11, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 198.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.7494356632232666, \"percentile_inc_nulls\": 0.7494356632232666, \"value_count\": 10, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 290.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.7351711988449097, \"percentile_inc_nulls\": 0.7351711988449097, \"value_count\": 9, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 297.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.7171125411987305, \"percentile_inc_nulls\": 0.7171125411987305, \"value_count\": 8, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 376.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.6966043710708618, \"percentile_inc_nulls\": 0.6966043710708618, \"value_count\": 7, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 427.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.6680755019187927, \"percentile_inc_nulls\": 0.6680755019187927, \"value_count\": 6, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 594.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.6394985914230347, \"percentile_inc_nulls\": 0.6394985914230347, \"value_count\": 5, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, 
\"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 595.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.6006916165351868, \"percentile_inc_nulls\": 0.6006916165351868, \"value_count\": 4, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 808.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.548388659954071, \"percentile_inc_nulls\": 0.548388659954071, \"value_count\": 3, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1089.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.43811535835266113, \"percentile_inc_nulls\": 0.43811535835266113, \"value_count\": 2, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 2296.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 1, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 9122.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 425, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 425.0, \"distinct_value_count\": 11403}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": 
\"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"street_address\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 11403 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 425, \"group_name\": \"_street_address_\", \"value\": \"700 universe blvd\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 174, \"group_name\": \"_street_address_\", \"value\": \"130 roberts st\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 150, \"group_name\": \"_street_address_\", \"value\": \"800 taylor st suite 200\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 89, \"group_name\": \"_street_address_\", \"value\": \"333 washington st\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 79, \"group_name\": \"_street_address_\", \"value\": \"1519 king st\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 72, \"group_name\": \"_street_address_\", \"value\": \"575 fifth ave 35th fl\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 72, \"group_name\": \"_street_address_\", \"value\": \"222 2nd ave south suite 1900\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 66, \"group_name\": \"_street_address_\", \"value\": \"101 summer st 2nd floor\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 66, \"group_name\": \"_street_address_\", \"value\": \"50101 
governors dr\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 64, \"group_name\": \"_street_address_\", \"value\": \"9405 arrowpoint blvd\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"13915 kimberly\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"220 w main sreet\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"2404 15th streetpo box 988\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"po box 2000\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"1255 23rd st nw ste 300\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", 
\"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 425]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}], \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\"}, {\"mode\": \"vega-lite\"});\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 400, \"continuousHeight\": 300}}, \"vconcat\": [{\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9991835355758667, \"percentile_inc_nulls\": 0.9991835355758667, \"value_count\": 17, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 17.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9984630942344666, \"percentile_inc_nulls\": 0.9984630942344666, \"value_count\": 15, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 15.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9978387355804443, \"percentile_inc_nulls\": 0.9978387355804443, \"value_count\": 13, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 13.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9972623586654663, \"percentile_inc_nulls\": 0.9972623586654663, \"value_count\": 12, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 12.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.996734082698822, \"percentile_inc_nulls\": 0.996734082698822, \"value_count\": 11, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 11.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9952932000160217, \"percentile_inc_nulls\": 0.9952932000160217, \"value_count\": 10, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 30.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 
0.9944286942481995, \"percentile_inc_nulls\": 0.9944286942481995, \"value_count\": 9, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 18.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9917390942573547, \"percentile_inc_nulls\": 0.9917390942573547, \"value_count\": 8, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 56.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9883770942687988, \"percentile_inc_nulls\": 0.9883770942687988, \"value_count\": 7, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9765621423721313, \"percentile_inc_nulls\": 0.9765621423721313, \"value_count\": 6, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 246.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.9439027905464172, \"percentile_inc_nulls\": 0.9439027905464172, \"value_count\": 5, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 680.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.8651361465454102, \"percentile_inc_nulls\": 0.8651361465454102, \"value_count\": 4, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1640.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.725949764251709, \"percentile_inc_nulls\": 0.725949764251709, \"value_count\": 3, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 2898.0, \"distinct_value_count\": 
14086}, {\"percentile_ex_nulls\": 0.4753373861312866, \"percentile_inc_nulls\": 0.4753373861312866, \"value_count\": 2, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 5218.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 1, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 9897.0, \"distinct_value_count\": 14086}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 17, \"group_name\": \"_company_name_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 17.0, \"distinct_value_count\": 14086}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"company_name\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 14086 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 17, \"group_name\": \"_company_name_\", \"value\": \"calpine corporation\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 15, \"group_name\": \"_company_name_\", \"value\": \"georgia pacific corporation\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, 
\"distinct_value_count\": 14086}, {\"value_count\": 13, \"group_name\": \"_company_name_\", \"value\": \"weyerhaeuser company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 12, \"group_name\": \"_company_name_\", \"value\": \"calpine eastern corporation\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 11, \"group_name\": \"_company_name_\", \"value\": \"calpine operating services company incorporated\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 10, \"group_name\": \"_company_name_\", \"value\": \"springfield city of\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 10, \"group_name\": \"_company_name_\", \"value\": \"international paper company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 10, \"group_name\": \"_company_name_\", \"value\": \"dow chemical company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 9, \"group_name\": \"_company_name_\", \"value\": \"newpage corporation\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 9, \"group_name\": \"_company_name_\", \"value\": \"smurfit stone container corporation\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, 
\"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"0ham wham8 solar limited liability company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"louisiana energy and power authority\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"longwing solar limited liability company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"lse lepus limited liability company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}, {\"value_count\": 1, \"group_name\": \"_company_name_\", \"value\": \"lse pegasus limited liability company\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 14086}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 17]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.8766032457351685, \"percentile_inc_nulls\": 0.8794006109237671, \"value_count\": 2511, 
\"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 2511.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.7900633811950684, \"percentile_inc_nulls\": 0.7948225140571594, \"value_count\": 1761, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1761.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.7252445220947266, \"percentile_inc_nulls\": 0.7314730286598206, \"value_count\": 1319, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1319.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.6654381155967712, \"percentile_inc_nulls\": 0.6730223894119263, \"value_count\": 1217, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1217.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.6124134063720703, \"percentile_inc_nulls\": 0.6211997270584106, \"value_count\": 1079, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1079.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.5711337327957153, \"percentile_inc_nulls\": 0.5808558464050293, \"value_count\": 840, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 840.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.5326551795005798, \"percentile_inc_nulls\": 0.5432496070861816, \"value_count\": 783, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 783.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.49756747484207153, \"percentile_inc_nulls\": 0.5089572668075562, \"value_count\": 714, 
\"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 714.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.4656739830970764, \"percentile_inc_nulls\": 0.4777868390083313, \"value_count\": 649, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 649.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.4348616600036621, \"percentile_inc_nulls\": 0.4476730227470398, \"value_count\": 627, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 627.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.410388708114624, \"percentile_inc_nulls\": 0.42375487089157104, \"value_count\": 498, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 498.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.3862597942352295, \"percentile_inc_nulls\": 0.4001728892326355, \"value_count\": 491, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 491.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.3641948103904724, \"percentile_inc_nulls\": 0.3786081075668335, \"value_count\": 449, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 449.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.3430144190788269, \"percentile_inc_nulls\": 0.35790789127349854, \"value_count\": 431, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 431.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.3237505555152893, \"percentile_inc_nulls\": 0.3390807509422302, \"value_count\": 392, \"group_name\": 
\"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 392.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.30483072996139526, \"percentile_inc_nulls\": 0.3205897808074951, \"value_count\": 385, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 385.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.2861074209213257, \"percentile_inc_nulls\": 0.30229097604751587, \"value_count\": 381, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 381.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.2678264379501343, \"percentile_inc_nulls\": 0.284424364566803, \"value_count\": 372, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 372.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.2511671185493469, \"percentile_inc_nulls\": 0.2681427597999573, \"value_count\": 339, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 339.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.235687255859375, \"percentile_inc_nulls\": 0.2530137896537781, \"value_count\": 315, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 315.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.2217307686805725, \"percentile_inc_nulls\": 0.23937368392944336, \"value_count\": 284, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 284.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.20811831951141357, \"percentile_inc_nulls\": 0.22606980800628662, \"value_count\": 277, \"group_name\": \"_state_\", 
\"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 277.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.19470244646072388, \"percentile_inc_nulls\": 0.21295809745788574, \"value_count\": 273, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 273.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.1813356876373291, \"percentile_inc_nulls\": 0.19989430904388428, \"value_count\": 272, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 272.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.15548676252365112, \"percentile_inc_nulls\": 0.17463135719299316, \"value_count\": 263, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 526.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.14349597692489624, \"percentile_inc_nulls\": 0.16291242837905884, \"value_count\": 244, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 244.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.13199663162231445, \"percentile_inc_nulls\": 0.1516737937927246, \"value_count\": 234, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.12148016691207886, \"percentile_inc_nulls\": 0.1413956880569458, \"value_count\": 214, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 214.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.11145508289337158, \"percentile_inc_nulls\": 0.1315978765487671, \"value_count\": 204, \"group_name\": \"_state_\", 
\"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 204.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.10162663459777832, \"percentile_inc_nulls\": 0.12199223041534424, \"value_count\": 200, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 200.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.09219127893447876, \"percentile_inc_nulls\": 0.11277073621749878, \"value_count\": 192, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 192.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.08457416296005249, \"percentile_inc_nulls\": 0.10532635450363159, \"value_count\": 155, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 155.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.07730108499526978, \"percentile_inc_nulls\": 0.09821814298629761, \"value_count\": 148, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 148.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.07012629508972168, \"percentile_inc_nulls\": 0.09120601415634155, \"value_count\": 146, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 146.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.06373775005340576, \"percentile_inc_nulls\": 0.08496230840682983, \"value_count\": 130, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 130.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.05759495496749878, \"percentile_inc_nulls\": 0.07895874977111816, \"value_count\": 125, \"group_name\": \"_state_\", 
\"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 125.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.052091002464294434, \"percentile_inc_nulls\": 0.07357954978942871, \"value_count\": 112, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.0471767783164978, \"percentile_inc_nulls\": 0.06877672672271729, \"value_count\": 100, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 100.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.042459070682525635, \"percentile_inc_nulls\": 0.06416600942611694, \"value_count\": 96, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 96.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.0380362868309021, \"percentile_inc_nulls\": 0.05984342098236084, \"value_count\": 90, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.033662617206573486, \"percentile_inc_nulls\": 0.0555688738822937, \"value_count\": 89, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 89.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.029436349868774414, \"percentile_inc_nulls\": 0.05143845081329346, \"value_count\": 86, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 86.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.02555406093597412, \"percentile_inc_nulls\": 0.047644197940826416, \"value_count\": 79, \"group_name\": \"_state_\", 
\"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 79.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.021720945835113525, \"percentile_inc_nulls\": 0.04389798641204834, \"value_count\": 78, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.018526732921600342, \"percentile_inc_nulls\": 0.04077613353729248, \"value_count\": 65, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 65.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.01572561264038086, \"percentile_inc_nulls\": 0.03803849220275879, \"value_count\": 57, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 57.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.013071894645690918, \"percentile_inc_nulls\": 0.03544497489929199, \"value_count\": 54, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 54.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.010467350482940674, \"percentile_inc_nulls\": 0.03289949893951416, \"value_count\": 53, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 53.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.006142795085906982, \"percentile_inc_nulls\": 0.028672993183135986, \"value_count\": 44, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.004078805446624756, \"percentile_inc_nulls\": 0.02665579319000244, \"value_count\": 42, \"group_name\": \"_state_\", 
\"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.002457141876220703, \"percentile_inc_nulls\": 0.02507084608078003, \"value_count\": 33, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.0012776851654052734, \"percentile_inc_nulls\": 0.02391815185546875, \"value_count\": 24, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 24.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.0004914402961730957, \"percentile_inc_nulls\": 0.023149728775024414, \"value_count\": 16, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 16.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.0003439784049987793, \"percentile_inc_nulls\": 0.02300560474395752, \"value_count\": 3, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 3.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.00024569034576416016, \"percentile_inc_nulls\": 0.022909581661224365, \"value_count\": 2, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 2.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.022669434547424316, \"value_count\": 1, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 5.0, \"distinct_value_count\": 62}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 2511, \"group_name\": \"_state_\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 
20821, \"sum_tokens_in_value_count_group\": 2511.0, \"distinct_value_count\": 62}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"state\\\"\", \"subtitle\": \"In this col, 472 values (2.3%) are null and there are 62 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 2511, \"group_name\": \"_state_\", \"value\": \"ca\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1761, \"group_name\": \"_state_\", \"value\": \"tx\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1319, \"group_name\": \"_state_\", \"value\": \"ny\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1217, \"group_name\": \"_state_\", \"value\": \"nc\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1079, \"group_name\": \"_state_\", \"value\": \"ma\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 840, \"group_name\": \"_state_\", \"value\": \"fl\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 783, \"group_name\": \"_state_\", \"value\": \"mn\", \"total_non_null_rows\": 20349, 
\"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 714, \"group_name\": \"_state_\", \"value\": \"nj\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 649, \"group_name\": \"_state_\", \"value\": \"pa\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 627, \"group_name\": \"_state_\", \"value\": \"il\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"8a\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"as\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"uk\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"gu\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}, {\"value_count\": 1, \"group_name\": \"_state_\", \"value\": \"pr\", \"total_non_null_rows\": 20349, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 62}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": 
\"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 2511]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9553115367889404, \"percentile_inc_nulls\": 0.9554296135902405, \"value_count\": 928, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 928.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.9231917858123779, \"percentile_inc_nulls\": 0.9233946800231934, \"value_count\": 667, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 667.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.89848792552948, \"percentile_inc_nulls\": 0.8987560868263245, \"value_count\": 513, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 513.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.8761436939239502, \"percentile_inc_nulls\": 0.8764708638191223, \"value_count\": 464, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 464.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.8545699715614319, \"percentile_inc_nulls\": 0.8549541234970093, \"value_count\": 448, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 448.0, \"distinct_value_count\": 3879}, 
{\"percentile_ex_nulls\": 0.8359819054603577, \"percentile_inc_nulls\": 0.83641517162323, \"value_count\": 386, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 386.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.8250987529754639, \"percentile_inc_nulls\": 0.8255607485771179, \"value_count\": 226, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 226.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.815178632736206, \"percentile_inc_nulls\": 0.8156668543815613, \"value_count\": 206, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 206.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.8057401180267334, \"percentile_inc_nulls\": 0.8062533140182495, \"value_count\": 196, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 196.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7966387271881104, \"percentile_inc_nulls\": 0.797175943851471, \"value_count\": 189, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 189.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7878744006156921, \"percentile_inc_nulls\": 0.7884347438812256, \"value_count\": 182, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 182.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7795916199684143, \"percentile_inc_nulls\": 0.7801738977432251, \"value_count\": 172, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 172.0, \"distinct_value_count\": 3879}, 
{\"percentile_ex_nulls\": 0.7714051604270935, \"percentile_inc_nulls\": 0.772009015083313, \"value_count\": 170, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 170.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7633631825447083, \"percentile_inc_nulls\": 0.7639882564544678, \"value_count\": 167, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 167.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7555619478225708, \"percentile_inc_nulls\": 0.7562077045440674, \"value_count\": 162, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 162.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7479052543640137, \"percentile_inc_nulls\": 0.7485711574554443, \"value_count\": 159, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 159.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7406337261199951, \"percentile_inc_nulls\": 0.7413188219070435, \"value_count\": 151, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 151.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7338438034057617, \"percentile_inc_nulls\": 0.7345468401908875, \"value_count\": 141, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 141.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7271501421928406, \"percentile_inc_nulls\": 0.7278709411621094, \"value_count\": 139, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 139.0, \"distinct_value_count\": 3879}, 
{\"percentile_ex_nulls\": 0.7211307287216187, \"percentile_inc_nulls\": 0.7218673229217529, \"value_count\": 125, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 125.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.715207576751709, \"percentile_inc_nulls\": 0.7159598469734192, \"value_count\": 123, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 123.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7093807458877563, \"percentile_inc_nulls\": 0.7101483941078186, \"value_count\": 121, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 121.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.7039391398429871, \"percentile_inc_nulls\": 0.704721212387085, \"value_count\": 113, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 113.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6933448910713196, \"percentile_inc_nulls\": 0.6941549777984619, \"value_count\": 110, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 220.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6880959272384644, \"percentile_inc_nulls\": 0.6889198422431946, \"value_count\": 109, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 109.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.677983283996582, \"percentile_inc_nulls\": 0.6788338422775269, \"value_count\": 105, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 210.0, \"distinct_value_count\": 3879}, 
{\"percentile_ex_nulls\": 0.6729750633239746, \"percentile_inc_nulls\": 0.6738389134407043, \"value_count\": 104, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 104.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6682076454162598, \"percentile_inc_nulls\": 0.6690840721130371, \"value_count\": 99, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 99.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6634883880615234, \"percentile_inc_nulls\": 0.6643773317337036, \"value_count\": 98, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6589136123657227, \"percentile_inc_nulls\": 0.6598145961761475, \"value_count\": 95, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 95.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6501492857933044, \"percentile_inc_nulls\": 0.6510734558105469, \"value_count\": 91, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 182.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6459115743637085, \"percentile_inc_nulls\": 0.6468468904495239, \"value_count\": 88, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 88.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6420109868049622, \"percentile_inc_nulls\": 0.6429566144943237, \"value_count\": 81, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 81.0, \"distinct_value_count\": 3879}, 
{\"percentile_ex_nulls\": 0.6383511424064636, \"percentile_inc_nulls\": 0.6393064260482788, \"value_count\": 76, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 76.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6313204169273376, \"percentile_inc_nulls\": 0.6322942972183228, \"value_count\": 73, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 146.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6243860125541687, \"percentile_inc_nulls\": 0.6253782510757446, \"value_count\": 72, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6212077140808105, \"percentile_inc_nulls\": 0.6222083568572998, \"value_count\": 66, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 66.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6180776357650757, \"percentile_inc_nulls\": 0.619086503982544, \"value_count\": 65, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 65.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6150919795036316, \"percentile_inc_nulls\": 0.616108775138855, \"value_count\": 62, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 62.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.612250804901123, \"percentile_inc_nulls\": 0.6132750511169434, \"value_count\": 59, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 59.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 
0.6094577312469482, \"percentile_inc_nulls\": 0.6104893684387207, \"value_count\": 58, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.6041606664657593, \"percentile_inc_nulls\": 0.6052062511444092, \"value_count\": 55, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5990561246871948, \"percentile_inc_nulls\": 0.6001152992248535, \"value_count\": 53, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 106.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5940479636192322, \"percentile_inc_nulls\": 0.5951203107833862, \"value_count\": 52, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 104.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.591592013835907, \"percentile_inc_nulls\": 0.5926708579063416, \"value_count\": 51, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 51.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5891842842102051, \"percentile_inc_nulls\": 0.5902694463729858, \"value_count\": 50, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 50.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5869209170341492, \"percentile_inc_nulls\": 0.5880120992660522, \"value_count\": 47, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 47.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5847057700157166, 
\"percentile_inc_nulls\": 0.5858027935028076, \"value_count\": 46, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 46.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5825387239456177, \"percentile_inc_nulls\": 0.583641529083252, \"value_count\": 45, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 45.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5783973932266235, \"percentile_inc_nulls\": 0.5795110464096069, \"value_count\": 43, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 86.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5743522644042969, \"percentile_inc_nulls\": 0.5754766464233398, \"value_count\": 42, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 84.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.568429172039032, \"percentile_inc_nulls\": 0.5695691704750061, \"value_count\": 41, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 123.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5626504421234131, \"percentile_inc_nulls\": 0.5638057589530945, \"value_count\": 40, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 120.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5607724189758301, \"percentile_inc_nulls\": 0.5619326829910278, \"value_count\": 39, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 39.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5536453723907471, \"percentile_inc_nulls\": 
0.5548244714736938, \"value_count\": 37, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 148.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5519117712974548, \"percentile_inc_nulls\": 0.5530954599380493, \"value_count\": 36, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 36.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5485408902168274, \"percentile_inc_nulls\": 0.5497334003448486, \"value_count\": 35, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.537416934967041, \"percentile_inc_nulls\": 0.5386388301849365, \"value_count\": 33, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 231.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5327939987182617, \"percentile_inc_nulls\": 0.534028172492981, \"value_count\": 32, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 96.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5253298282623291, \"percentile_inc_nulls\": 0.526583731174469, \"value_count\": 31, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 155.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.523885190486908, \"percentile_inc_nulls\": 0.5251429080963135, \"value_count\": 30, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 30.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5210921764373779, \"percentile_inc_nulls\": 0.5223572254180908, 
\"value_count\": 29, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 58.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5156987309455872, \"percentile_inc_nulls\": 0.5169780254364014, \"value_count\": 28, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5104979276657104, \"percentile_inc_nulls\": 0.5117909908294678, \"value_count\": 27, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 108.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.5067417621612549, \"percentile_inc_nulls\": 0.5080447196960449, \"value_count\": 26, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.4995184540748596, \"percentile_inc_nulls\": 0.5008404850959778, \"value_count\": 25, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 150.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.49720698595046997, \"percentile_inc_nulls\": 0.49853515625, \"value_count\": 24, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 48.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.49056148529052734, \"percentile_inc_nulls\": 0.49190717935562134, \"value_count\": 23, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 138.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.4852643609046936, \"percentile_inc_nulls\": 0.4866240620613098, \"value_count\": 22, 
\"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 110.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.47616297006607056, \"percentile_inc_nulls\": 0.47754669189453125, \"value_count\": 21, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 189.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.46845805644989014, \"percentile_inc_nulls\": 0.469862163066864, \"value_count\": 20, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.46388328075408936, \"percentile_inc_nulls\": 0.46529942750930786, \"value_count\": 19, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 95.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.4595492482185364, \"percentile_inc_nulls\": 0.46097689867019653, \"value_count\": 18, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 90.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.4489068388938904, \"percentile_inc_nulls\": 0.4503626227378845, \"value_count\": 17, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 221.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.44120198488235474, \"percentile_inc_nulls\": 0.4426780939102173, \"value_count\": 16, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 160.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.43325626850128174, \"percentile_inc_nulls\": 0.4347533583641052, \"value_count\": 15, 
\"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 165.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.42044687271118164, \"percentile_inc_nulls\": 0.42197781801223755, \"value_count\": 14, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 266.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.4135606288909912, \"percentile_inc_nulls\": 0.4151097536087036, \"value_count\": 13, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 143.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.4025811553001404, \"percentile_inc_nulls\": 0.4041592478752136, \"value_count\": 12, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 228.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.3919869065284729, \"percentile_inc_nulls\": 0.3935930132865906, \"value_count\": 11, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 220.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.37465089559555054, \"percentile_inc_nulls\": 0.3763027787208557, \"value_count\": 10, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 360.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.35991525650024414, \"percentile_inc_nulls\": 0.36160606145858765, \"value_count\": 9, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 306.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.33757102489471436, \"percentile_inc_nulls\": 0.3393208980560303, \"value_count\": 8, 
\"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 464.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.31869399547576904, \"percentile_inc_nulls\": 0.32049375772476196, \"value_count\": 7, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 392.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.2932678461074829, \"percentile_inc_nulls\": 0.2951347231864929, \"value_count\": 6, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 528.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.25739187002182007, \"percentile_inc_nulls\": 0.25935351848602295, \"value_count\": 5, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 745.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.2086583971977234, \"percentile_inc_nulls\": 0.2107487916946411, \"value_count\": 4, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1012.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.15621691942214966, \"percentile_inc_nulls\": 0.15844577550888062, \"value_count\": 3, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1089.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.09313303232192993, \"percentile_inc_nulls\": 0.09552854299545288, \"value_count\": 2, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1310.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0026415586471557617, \"value_count\": 1, \"group_name\": 
\"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1934.0, \"distinct_value_count\": 3879}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 928, \"group_name\": \"_city_\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 928.0, \"distinct_value_count\": 3879}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"city\\\"\", \"subtitle\": \"In this col, 55 values (0.3%) are null and there are 3879 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 928, \"group_name\": \"_city_\", \"value\": \"houston\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 667, \"group_name\": \"_city_\", \"value\": \"new york\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 513, \"group_name\": \"_city_\", \"value\": \"san francisco\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 464, \"group_name\": \"_city_\", \"value\": \"juno beach\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 448, \"group_name\": \"_city_\", \"value\": \"boston\", 
\"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 386, \"group_name\": \"_city_\", \"value\": \"charlotte\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 226, \"group_name\": \"_city_\", \"value\": \"chicago\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 206, \"group_name\": \"_city_\", \"value\": \"san diego\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 196, \"group_name\": \"_city_\", \"value\": \"andover\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 189, \"group_name\": \"_city_\", \"value\": \"nashville\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"marengo twp\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"oldsmar\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"uniondale\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, 
\"distinct_value_count\": 3879}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"carey\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}, {\"value_count\": 1, \"group_name\": \"_city_\", \"value\": \"williamston\", \"total_non_null_rows\": 20766, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 3879}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 928]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}, {\"hconcat\": [{\"mark\": {\"type\": \"line\", \"interpolate\": \"step-before\"}, \"data\": {\"values\": [{\"percentile_ex_nulls\": 0.9795879125595093, \"percentile_inc_nulls\": 0.9795879125595093, \"value_count\": 425, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 425.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9712309837341309, \"percentile_inc_nulls\": 0.9712309837341309, \"value_count\": 174, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 174.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.964026689529419, \"percentile_inc_nulls\": 0.964026689529419, \"value_count\": 150, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 150.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9597522020339966, \"percentile_inc_nulls\": 0.9597522020339966, 
\"value_count\": 89, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 89.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9559579491615295, \"percentile_inc_nulls\": 0.9559579491615295, \"value_count\": 79, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 79.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9490418434143066, \"percentile_inc_nulls\": 0.9490418434143066, \"value_count\": 72, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.942702054977417, \"percentile_inc_nulls\": 0.942702054977417, \"value_count\": 66, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 132.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9365544319152832, \"percentile_inc_nulls\": 0.9365544319152832, \"value_count\": 64, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 128.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9335286617279053, \"percentile_inc_nulls\": 0.9335286617279053, \"value_count\": 63, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 63.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9305508732795715, \"percentile_inc_nulls\": 0.9305508732795715, \"value_count\": 62, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 62.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 
0.9277172088623047, \"percentile_inc_nulls\": 0.9277172088623047, \"value_count\": 59, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 59.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9251717329025269, \"percentile_inc_nulls\": 0.9251717329025269, \"value_count\": 53, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 53.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9202728271484375, \"percentile_inc_nulls\": 0.9202728271484375, \"value_count\": 51, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 102.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9179674386978149, \"percentile_inc_nulls\": 0.9179674386978149, \"value_count\": 48, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 48.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9111953973770142, \"percentile_inc_nulls\": 0.9111953973770142, \"value_count\": 47, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 141.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9089860916137695, \"percentile_inc_nulls\": 0.9089860916137695, \"value_count\": 46, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 46.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9068248271942139, \"percentile_inc_nulls\": 0.9068248271942139, \"value_count\": 45, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 45.0, 
\"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9047116041183472, \"percentile_inc_nulls\": 0.9047116041183472, \"value_count\": 44, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 44.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.9027424454689026, \"percentile_inc_nulls\": 0.9027424454689026, \"value_count\": 41, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 41.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8989001512527466, \"percentile_inc_nulls\": 0.8989001512527466, \"value_count\": 40, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 80.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8915998339653015, \"percentile_inc_nulls\": 0.8915998339653015, \"value_count\": 38, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 152.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.886268675327301, \"percentile_inc_nulls\": 0.886268675327301, \"value_count\": 37, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 111.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8829066753387451, \"percentile_inc_nulls\": 0.8829066753387451, \"value_count\": 35, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 70.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8813217282295227, \"percentile_inc_nulls\": 0.8813217282295227, \"value_count\": 33, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, 
\"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 33.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8738773465156555, \"percentile_inc_nulls\": 0.8738773465156555, \"value_count\": 31, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 155.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8709956407546997, \"percentile_inc_nulls\": 0.8709956407546997, \"value_count\": 30, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 60.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8696027994155884, \"percentile_inc_nulls\": 0.8696027994155884, \"value_count\": 29, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 29.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8628787994384766, \"percentile_inc_nulls\": 0.8628787994384766, \"value_count\": 28, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 140.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8563950061798096, \"percentile_inc_nulls\": 0.8563950061798096, \"value_count\": 27, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 135.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8526487350463867, \"percentile_inc_nulls\": 0.8526487350463867, \"value_count\": 26, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 78.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8490466475486755, \"percentile_inc_nulls\": 0.8490466475486755, \"value_count\": 25, \"group_name\": 
\"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 75.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8455885648727417, \"percentile_inc_nulls\": 0.8455885648727417, \"value_count\": 24, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 72.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8411699533462524, \"percentile_inc_nulls\": 0.8411699533462524, \"value_count\": 23, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 92.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8401133418083191, \"percentile_inc_nulls\": 0.8401133418083191, \"value_count\": 22, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 22.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8380961418151855, \"percentile_inc_nulls\": 0.8380961418151855, \"value_count\": 21, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 42.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8332933187484741, \"percentile_inc_nulls\": 0.8332933187484741, \"value_count\": 20, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 100.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.828730583190918, \"percentile_inc_nulls\": 0.828730583190918, \"value_count\": 19, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 95.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8218145370483398, \"percentile_inc_nulls\": 
0.8218145370483398, \"value_count\": 18, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 144.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8136496543884277, \"percentile_inc_nulls\": 0.8136496543884277, \"value_count\": 17, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 170.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.8082705140113831, \"percentile_inc_nulls\": 0.8082705140113831, \"value_count\": 16, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 112.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.800345778465271, \"percentile_inc_nulls\": 0.800345778465271, \"value_count\": 15, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 165.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.7956390380859375, \"percentile_inc_nulls\": 0.7956390380859375, \"value_count\": 14, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 98.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.7844003438949585, \"percentile_inc_nulls\": 0.7844003438949585, \"value_count\": 13, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 234.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.7728735208511353, \"percentile_inc_nulls\": 0.7728735208511353, \"value_count\": 12, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 240.0, \"distinct_value_count\": 11403}, 
{\"percentile_ex_nulls\": 0.7633638978004456, \"percentile_inc_nulls\": 0.7633638978004456, \"value_count\": 11, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 198.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.7494356632232666, \"percentile_inc_nulls\": 0.7494356632232666, \"value_count\": 10, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 290.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.7351711988449097, \"percentile_inc_nulls\": 0.7351711988449097, \"value_count\": 9, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 297.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.7171125411987305, \"percentile_inc_nulls\": 0.7171125411987305, \"value_count\": 8, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 376.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.6966043710708618, \"percentile_inc_nulls\": 0.6966043710708618, \"value_count\": 7, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 427.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.6680755019187927, \"percentile_inc_nulls\": 0.6680755019187927, \"value_count\": 6, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 594.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.6394985914230347, \"percentile_inc_nulls\": 0.6394985914230347, \"value_count\": 5, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, 
\"sum_tokens_in_value_count_group\": 595.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.6006916165351868, \"percentile_inc_nulls\": 0.6006916165351868, \"value_count\": 4, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 808.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.548388659954071, \"percentile_inc_nulls\": 0.548388659954071, \"value_count\": 3, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 1089.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.43811535835266113, \"percentile_inc_nulls\": 0.43811535835266113, \"value_count\": 2, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 2296.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 0.0, \"percentile_inc_nulls\": 0.0, \"value_count\": 1, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 9122.0, \"distinct_value_count\": 11403}, {\"percentile_ex_nulls\": 1.0, \"percentile_inc_nulls\": 1.0, \"value_count\": 425, \"group_name\": \"_street_address_\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"sum_tokens_in_value_count_group\": 425.0, \"distinct_value_count\": 11403}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"percentile_ex_nulls\", \"type\": \"quantitative\"}, {\"field\": \"percentile_inc_nulls\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"percentile_ex_nulls\", \"sort\": \"descending\", \"title\": \"Percentile\", \"type\": \"quantitative\"}, \"y\": {\"field\": 
\"value_count\", \"title\": \"Count of values\", \"type\": \"quantitative\"}}, \"title\": {\"text\": \"Distribution of counts of values in column \\\"street_address\\\"\", \"subtitle\": \"In this col, 0 values (0.0%) are null and there are 11403 distinct values\"}}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 425, \"group_name\": \"_street_address_\", \"value\": \"700 universe blvd\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 174, \"group_name\": \"_street_address_\", \"value\": \"130 roberts st\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 150, \"group_name\": \"_street_address_\", \"value\": \"800 taylor st suite 200\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 89, \"group_name\": \"_street_address_\", \"value\": \"333 washington st\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 79, \"group_name\": \"_street_address_\", \"value\": \"1519 king st\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 72, \"group_name\": \"_street_address_\", \"value\": \"575 fifth ave 35th fl\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 72, \"group_name\": \"_street_address_\", \"value\": \"222 2nd ave south suite 1900\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 66, \"group_name\": \"_street_address_\", \"value\": \"50101 governors dr\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 66, \"group_name\": \"_street_address_\", \"value\": \"101 summer st 2nd floor\", \"total_non_null_rows\": 
20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 64, \"group_name\": \"_street_address_\", \"value\": \"66 york st 5th floor\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": \"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Top 10 values by value count\"}, {\"mark\": \"bar\", \"data\": {\"values\": [{\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"p o box 4998 1955 workman ml rd\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"13915 kimberly\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"357 6th ave w\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"40mtower ln suite 201\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}, {\"value_count\": 1, \"group_name\": \"_street_address_\", \"value\": \"2160 s first ave\", \"total_non_null_rows\": 20821, \"total_rows_inc_nulls\": 20821, \"distinct_value_count\": 11403}]}, \"encoding\": {\"tooltip\": [{\"field\": \"value\", \"type\": \"nominal\"}, {\"field\": \"value_count\", \"type\": \"quantitative\"}, {\"field\": \"total_non_null_rows\", \"type\": \"quantitative\"}, {\"field\": 
\"total_rows_inc_nulls\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"value\", \"sort\": \"-y\", \"title\": null, \"type\": \"nominal\"}, \"y\": {\"field\": \"value_count\", \"scale\": {\"domain\": [0, 425]}, \"title\": \"Value count\", \"type\": \"quantitative\"}}, \"title\": \"Bottom 5 values by value count\"}]}], \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\"}, {\"mode\": \"vega-lite\"});\n",
        "</script>"
       ],
       "text/plain": [
        "alt.VConcatChart(...)"
       ]
      },
-     "execution_count": 25,
+     "execution_count": 19,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -674,7 +674,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 20,
    "id": "1e6d3d24-f8ce-4118-9d31-d4f237d28237",
    "metadata": {},
    "outputs": [
@@ -688,7 +688,7 @@
        " 'link_type_join_condition': 'where l.\"source_dataset\" || \\'-__-\\' || l.\"record_id\" < r.\"source_dataset\" || \\'-__-\\' || r.\"record_id\" and l.\"source_dataset\" != r.\"source_dataset\"'}"
       ]
      },
-     "execution_count": 26,
+     "execution_count": 20,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -708,7 +708,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 21,
    "id": "c37c117a-0fa2-4b8a-9fae-da3569895ca3",
    "metadata": {},
    "outputs": [
@@ -772,7 +772,7 @@
        "2  FRST      816       36        29376"
       ]
      },
-     "execution_count": 27,
+     "execution_count": 21,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -791,7 +791,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 22,
    "id": "4e1a9844-5d98-4cac-a083-eef134f083ce",
    "metadata": {},
    "outputs": [
@@ -800,23 +800,23 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-992b4c7852b74b80835e3a88352b4008.vega-embed {\n",
+       "  #altair-viz-cb04ee5d2ffe4decac9ccc71d3d339fa.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-992b4c7852b74b80835e3a88352b4008.vega-embed details,\n",
-       "  #altair-viz-992b4c7852b74b80835e3a88352b4008.vega-embed details summary {\n",
+       "  #altair-viz-cb04ee5d2ffe4decac9ccc71d3d339fa.vega-embed details,\n",
+       "  #altair-viz-cb04ee5d2ffe4decac9ccc71d3d339fa.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-992b4c7852b74b80835e3a88352b4008\"></div>\n",
+       "<div id=\"altair-viz-cb04ee5d2ffe4decac9ccc71d3d339fa\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-992b4c7852b74b80835e3a88352b4008\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-992b4c7852b74b80835e3a88352b4008\");\n",
+       "    if (outputDiv.id !== \"altair-viz-cb04ee5d2ffe4decac9ccc71d3d339fa\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-cb04ee5d2ffe4decac9ccc71d3d339fa\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -869,7 +869,7 @@
        "alt.Chart(...)"
       ]
      },
-     "execution_count": 28,
+     "execution_count": 22,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -894,7 +894,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 23,
    "id": "f72f546c-c338-4c9f-aab1-ba03c3169e18",
    "metadata": {},
    "outputs": [
@@ -918,7 +918,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 24,
    "id": "4298a288-c306-4d75-9d72-e5b8f87774ce",
    "metadata": {},
    "outputs": [
@@ -942,7 +942,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 25,
    "id": "afdd5872-bc29-406f-bd0a-d5f4436f6794",
    "metadata": {},
    "outputs": [
@@ -965,7 +965,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 26,
    "id": "90596d17-edb4-4ed1-9306-ea6c33ad00c6",
    "metadata": {},
    "outputs": [
@@ -989,7 +989,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 27,
    "id": "a946c820-9b93-41f1-9fc2-6da47d0cf407",
    "metadata": {},
    "outputs": [],
@@ -1012,7 +1012,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 28,
    "id": "36cae876-783d-4bff-89df-9d30cc5e60d6",
    "metadata": {},
    "outputs": [
@@ -1031,7 +1031,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 29,
    "id": "a5190bda-070d-4d8e-a6db-be7c65b04db3",
    "metadata": {},
    "outputs": [
@@ -1045,7 +1045,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "c4bcd9c2605a413aab003a2484a4a006",
+       "model_id": "2d93ffa99ee34fd08af14f8ed7c2731e",
        "version_major": 2,
        "version_minor": 0
       },
@@ -1059,7 +1059,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "b15bb7a15e37447ba1366278db3ab2bd",
+       "model_id": "776e605976fa4d8b9a31f2117543eda7",
        "version_major": 2,
        "version_minor": 0
       },
@@ -1091,7 +1091,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 30,
    "id": "f8dd1336-8a5b-49d2-a054-df0c0a0e953f",
    "metadata": {},
    "outputs": [
@@ -1122,7 +1122,7 @@
       "Iteration 1: Largest change in params was 0.702 in the m_probability of street_address, level `All other comparisons`\n",
       "Iteration 2: Largest change in params was 0.283 in probability_two_random_records_match\n",
       "Iteration 3: Largest change in params was 0.282 in probability_two_random_records_match\n",
-      "Iteration 4: Largest change in params was 0.000535 in probability_two_random_records_match\n",
+      "Iteration 4: Largest change in params was 0.000536 in probability_two_random_records_match\n",
       "Iteration 5: Largest change in params was 1.09e-07 in probability_two_random_records_match\n",
       "\n",
       "EM converged after 5 iterations\n",
@@ -1143,7 +1143,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 31,
    "id": "9581aa18-3352-429a-86c4-6078bcf13a55",
    "metadata": {},
    "outputs": [
@@ -1166,16 +1166,16 @@
       "    - street_address\n",
       "\n",
       "Iteration 1: Largest change in params was -0.967 in the m_probability of company_name_no_legal, level `Exact match on company_name_no_legal`\n",
-      "Iteration 2: Largest change in params was 0.476 in probability_two_random_records_match\n",
-      "Iteration 3: Largest change in params was 0.0397 in probability_two_random_records_match\n",
-      "Iteration 4: Largest change in params was 0.0442 in the m_probability of city, level `All other comparisons`\n",
-      "Iteration 5: Largest change in params was 0.0194 in probability_two_random_records_match\n",
-      "Iteration 6: Largest change in params was 0.00729 in probability_two_random_records_match\n",
-      "Iteration 7: Largest change in params was 0.00274 in probability_two_random_records_match\n",
-      "Iteration 8: Largest change in params was 0.00104 in probability_two_random_records_match\n",
-      "Iteration 9: Largest change in params was 0.000398 in probability_two_random_records_match\n",
-      "Iteration 10: Largest change in params was 0.000153 in probability_two_random_records_match\n",
-      "Iteration 11: Largest change in params was 5.88e-05 in probability_two_random_records_match\n",
+      "Iteration 2: Largest change in params was 0.472 in probability_two_random_records_match\n",
+      "Iteration 3: Largest change in params was 0.0399 in probability_two_random_records_match\n",
+      "Iteration 4: Largest change in params was 0.044 in the m_probability of city, level `All other comparisons`\n",
+      "Iteration 5: Largest change in params was 0.0192 in probability_two_random_records_match\n",
+      "Iteration 6: Largest change in params was 0.00723 in probability_two_random_records_match\n",
+      "Iteration 7: Largest change in params was 0.00272 in probability_two_random_records_match\n",
+      "Iteration 8: Largest change in params was 0.00103 in probability_two_random_records_match\n",
+      "Iteration 9: Largest change in params was 0.000394 in probability_two_random_records_match\n",
+      "Iteration 10: Largest change in params was 0.000151 in probability_two_random_records_match\n",
+      "Iteration 11: Largest change in params was 5.82e-05 in probability_two_random_records_match\n",
       "\n",
       "EM converged after 11 iterations\n",
       "\n",
@@ -1192,7 +1192,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 32,
    "id": "61298aa2-dbd4-4f2a-9c25-5f831d226d13",
    "metadata": {},
    "outputs": [
@@ -1201,23 +1201,23 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-185f3e4a9af6415baed71f7c69036c99.vega-embed {\n",
+       "  #altair-viz-a30c4d5d05a54eeeae75cf63ccd3a30f.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-185f3e4a9af6415baed71f7c69036c99.vega-embed details,\n",
-       "  #altair-viz-185f3e4a9af6415baed71f7c69036c99.vega-embed details summary {\n",
+       "  #altair-viz-a30c4d5d05a54eeeae75cf63ccd3a30f.vega-embed details,\n",
+       "  #altair-viz-a30c4d5d05a54eeeae75cf63ccd3a30f.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-185f3e4a9af6415baed71f7c69036c99\"></div>\n",
+       "<div id=\"altair-viz-a30c4d5d05a54eeeae75cf63ccd3a30f\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-185f3e4a9af6415baed71f7c69036c99\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-185f3e4a9af6415baed71f7c69036c99\");\n",
+       "    if (outputDiv.id !== \"altair-viz-a30c4d5d05a54eeeae75cf63ccd3a30f\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-a30c4d5d05a54eeeae75cf63ccd3a30f\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -1263,14 +1263,14 @@
        "        .catch(showError)\n",
        "        .then(() => displayChart(vegaEmbed));\n",
        "    }\n",
-       "  })({\"config\": {\"view\": {\"continuousWidth\": 300, \"continuousHeight\": 300, \"discreteHeight\": 60, \"discreteWidth\": 400}, \"header\": {\"title\": null}, \"mark\": {\"tooltip\": null}, \"title\": {\"anchor\": \"middle\"}}, \"vconcat\": [{\"mark\": {\"type\": \"bar\", \"clip\": true, \"height\": 15}, \"encoding\": {\"color\": {\"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-10, 0, 10], \"interpolate\": \"lab\", \"range\": [\"red\", \"#bbbbbb\", \"green\"]}, \"title\": \"Match weight\", \"type\": \"quantitative\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"probability_two_random_records_match\", \"format\": \".4f\", \"title\": \"Probability two random records match\", \"type\": \"nominal\"}, {\"field\": \"log2_bayes_factor\", \"format\": \",.4f\", \"title\": \"Equivalent match weight\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"domain\": false, \"gridColor\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": \"#aaa\"}, \"value\": \"#ddd\"}, \"gridDash\": {\"condition\": {\"test\": \"abs(datum.value / 10) == 1\", \"value\": [3]}, \"value\": null}, \"gridWidth\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": 2}, \"value\": 1}, \"labels\": false, \"ticks\": false, \"title\": \"\"}, \"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-19, 19]}, \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": \"Prior (starting) match weight\", \"titleAlign\": \"right\", \"titleAngle\": 0, \"titleFontWeight\": \"normal\"}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": 20, \"transform\": [{\"filter\": \"(datum.comparison_name == 
'probability_two_random_records_match')\"}]}, {\"mark\": {\"type\": \"bar\", \"clip\": true}, \"encoding\": {\"color\": {\"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-10, 0, 10], \"interpolate\": \"lab\", \"range\": [\"red\", \"#bbbbbb\", \"green\"]}, \"title\": \"Match weight\", \"type\": \"quantitative\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labelAlign\": \"left\", \"labelAnchor\": \"middle\", \"labelAngle\": 0}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"gridColor\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": \"#aaa\"}, \"value\": \"#ddd\"}, \"gridDash\": {\"condition\": {\"test\": \"abs(datum.value / 10) == 1\", \"value\": [3]}, \"value\": null}, \"gridWidth\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": 2}, \"value\": 1}, 
\"title\": \"Comparison level match weight = log2(m/u)\"}, \"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-19, 19]}, \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"axis\": {\"y\": \"independent\"}, \"scale\": {\"y\": \"independent\"}}, \"transform\": [{\"filter\": \"(datum.comparison_name != 'probability_two_random_records_match')\"}]}], \"data\": {\"name\": \"data-740da21ad061123ec94a64fd1de6c98f\"}, \"params\": [{\"name\": \"mouse_zoom\", \"select\": {\"type\": \"interval\", \"encodings\": [\"x\"]}, \"bind\": \"scales\", \"views\": []}], \"resolve\": {\"axis\": {\"y\": \"independent\"}, \"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Model parameters (components of final match weight)\", \"subtitle\": \"Use mousewheel to zoom\"}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-740da21ad061123ec94a64fd1de6c98f\": [{\"comparison_name\": \"probability_two_random_records_match\", \"sql_condition\": null, \"label_for_charts\": \"\", \"m_probability\": null, \"u_probability\": null, \"m_probability_description\": null, \"u_probability_description\": null, \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": null, \"is_null_level\": false, \"bayes_factor\": 2.3743083676072958e-06, \"log2_bayes_factor\": -18.684061249539493, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 0, \"bayes_factor_description\": \"The probability that two random records drawn at random match is 0.000 or one in  421,176.3 records.This is equivalent to a starting match weight of -18.684.\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": -1}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": 
\"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Exact match on company_name_no_legal\", \"m_probability\": 0.5058150813342962, \"u_probability\": 1.2180592034555192e-06, \"m_probability_description\": \"Amongst matching record comparisons, 50.58% of records (i.e. one in 1.977) are in the exact match on company_name_no_legal comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.0001218% of records (i.e. one in 820,978) are in the exact match on company_name_no_legal comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"company_name_no_legal\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 415263.13326917647, \"log2_bayes_factor\": 18.663666270565923, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on company_name_no_legal` then comparison is 415,263 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"jaro_winkler_similarity(\\\"company_name_no_legal_l\\\", \\\"company_name_no_legal_r\\\") >= 0.95\", \"label_for_charts\": \"Jaro-Winkler distance of company_name_no_legal >= 0.95\", \"m_probability\": 0.0023254626281341145, \"u_probability\": 3.8253098951495645e-07, \"m_probability_description\": \"Amongst matching record comparisons, 0.2325% of records (i.e. one in 430) are in the jaro-winkler distance of company_name_no_legal >= 0.95 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 3.825e-05% of records (i.e. 
one in 2,614,167) are in the jaro-winkler distance of company_name_no_legal >= 0.95 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 6079.148335361709, \"log2_bayes_factor\": 12.569653506618183, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of company_name_no_legal >= 0.95` then comparison is 6,079 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.9860443747032734, \"u_probability\": 0.9999983994098071, \"m_probability_description\": \"Amongst matching record comparisons, 98.6% of records (i.e. one in 1.014) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 100% of records (i.e. 
one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.9860459529587555, \"log2_bayes_factor\": -0.020273212421214715, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.014 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"\\\"street_address_l\\\" = \\\"street_address_r\\\"\", \"label_for_charts\": \"Exact match on street_address\", \"m_probability\": 0.10984848485090659, \"u_probability\": 1.143566326549975e-05, \"m_probability_description\": \"Amongst matching record comparisons, 10.98% of records (i.e. one in 9.103) are in the exact match on street_address comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.001144% of records (i.e. 
one in 87,446) are in the exact match on street_address comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"street_address\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 9605.781693687017, \"log2_bayes_factor\": 13.229687306123738, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on street_address` then comparison is 9,606 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"levenshtein(\\\"street_address_l\\\", \\\"street_address_r\\\") <= 1\", \"label_for_charts\": \"Levenshtein distance of street_address <= 1\", \"m_probability\": 0.008522727272915167, \"u_probability\": 2.119020349813114e-05, \"m_probability_description\": \"Amongst matching record comparisons, 0.8523% of records (i.e. one in 117) are in the levenshtein distance of street_address <= 1 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.002119% of records (i.e. 
one in 47,192) are in the levenshtein distance of street_address <= 1 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 402.20129427576404, \"log2_bayes_factor\": 8.651773913989402, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `levenshtein distance of street_address <= 1` then comparison is 402 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.8816287878761782, \"u_probability\": 0.9999673741332363, \"m_probability_description\": \"Amongst matching record comparisons, 88.16% of records (i.e. one in 1.134) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 100% of records (i.e. one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.8816575527180243, \"log2_bayes_factor\": -0.18170969185989266, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.134 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"state\", \"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Exact match on state\", \"m_probability\": 0.8114435216387779, \"u_probability\": 0.052535716053222686, \"m_probability_description\": \"Amongst matching record comparisons, 81.14% of records (i.e. 
one in 1.232) are in the exact match on state comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 5.254% of records (i.e. one in 19.03) are in the exact match on state comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"state\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 15.445559375582198, \"log2_bayes_factor\": 3.949120215368288, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on state` then comparison is 15.45 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 2}, {\"comparison_name\": \"state\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.1885564783612222, \"u_probability\": 0.9474642839467773, \"m_probability_description\": \"Amongst matching record comparisons, 18.86% of records (i.e. one in 5.303) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 94.75% of records (i.e. 
one in 1.055) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.19901170055272938, \"log2_bayes_factor\": -2.3290748408362094, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 5.025 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 2}, {\"comparison_name\": \"city\", \"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Exact match on city\", \"m_probability\": 0.6839727297704452, \"u_probability\": 0.0067046866658963065, \"m_probability_description\": \"Amongst matching record comparisons, 68.4% of records (i.e. one in 1.462) are in the exact match on city comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.6705% of records (i.e. one in 149) are in the exact match on city comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"city\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 102.01412293426084, \"log2_bayes_factor\": 6.672625083900734, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on city` then comparison is 102 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 3}, {\"comparison_name\": \"city\", \"sql_condition\": \"jaro_winkler_similarity(\\\"city_l\\\", \\\"city_r\\\") >= 0.9\", \"label_for_charts\": \"Jaro-Winkler distance of city >= 0.9\", \"m_probability\": 0.021460503551289617, \"u_probability\": 0.0005311505676706402, \"m_probability_description\": \"Amongst matching record comparisons, 2.146% of records (i.e. 
one in 46.6) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.05312% of records (i.e. one in 1,883) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 40.40380422712267, \"log2_bayes_factor\": 5.336419231458947, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of city >= 0.9` then comparison is 40.4 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 3}, {\"comparison_name\": \"city\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.2945667666782652, \"u_probability\": 0.9927641627664331, \"m_probability_description\": \"Amongst matching record comparisons, 29.46% of records (i.e. one in 3.395) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 99.28% of records (i.e. one in 1.007) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.2967137389986223, \"log2_bayes_factor\": -1.7528563641383663, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 3.37 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 3}]}}, {\"mode\": \"vega-lite\"});\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 300, \"continuousHeight\": 300, \"discreteHeight\": 60, \"discreteWidth\": 400}, \"header\": {\"title\": null}, \"mark\": {\"tooltip\": null}, \"title\": {\"anchor\": \"middle\"}}, \"vconcat\": [{\"mark\": {\"type\": \"bar\", \"clip\": true, \"height\": 15}, \"encoding\": {\"color\": {\"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-10, 0, 10], \"interpolate\": \"lab\", \"range\": [\"red\", \"#bbbbbb\", \"green\"]}, \"title\": \"Match weight\", \"type\": \"quantitative\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"probability_two_random_records_match\", \"format\": \".4f\", \"title\": \"Probability two random records match\", \"type\": \"nominal\"}, {\"field\": \"log2_bayes_factor\", \"format\": \",.4f\", \"title\": \"Equivalent match weight\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"domain\": false, \"gridColor\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": \"#aaa\"}, \"value\": \"#ddd\"}, \"gridDash\": {\"condition\": {\"test\": \"abs(datum.value / 10) == 1\", \"value\": [3]}, \"value\": null}, \"gridWidth\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": 2}, \"value\": 1}, \"labels\": false, \"ticks\": false, \"title\": \"\"}, \"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-19, 19]}, \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": \"Prior (starting) match weight\", \"titleAlign\": \"right\", \"titleAngle\": 0, \"titleFontWeight\": \"normal\"}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": 20, \"transform\": [{\"filter\": \"(datum.comparison_name == 
'probability_two_random_records_match')\"}]}, {\"mark\": {\"type\": \"bar\", \"clip\": true}, \"encoding\": {\"color\": {\"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-10, 0, 10], \"interpolate\": \"lab\", \"range\": [\"red\", \"#bbbbbb\", \"green\"]}, \"title\": \"Match weight\", \"type\": \"quantitative\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labelAlign\": \"left\", \"labelAnchor\": \"middle\", \"labelAngle\": 0}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"gridColor\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": \"#aaa\"}, \"value\": \"#ddd\"}, \"gridDash\": {\"condition\": {\"test\": \"abs(datum.value / 10) == 1\", \"value\": [3]}, \"value\": null}, \"gridWidth\": {\"condition\": {\"test\": \"abs(datum.value / 10)  <= 1 & datum.value % 10 === 0\", \"value\": 2}, \"value\": 1}, 
\"title\": \"Comparison level match weight = log2(m/u)\"}, \"field\": \"log2_bayes_factor\", \"scale\": {\"domain\": [-19, 19]}, \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"axis\": {\"y\": \"independent\"}, \"scale\": {\"y\": \"independent\"}}, \"transform\": [{\"filter\": \"(datum.comparison_name != 'probability_two_random_records_match')\"}]}], \"data\": {\"name\": \"data-3e936f170076224a42263d2902120b87\"}, \"params\": [{\"name\": \"mouse_zoom\", \"select\": {\"type\": \"interval\", \"encodings\": [\"x\"]}, \"bind\": \"scales\", \"views\": []}], \"resolve\": {\"axis\": {\"y\": \"independent\"}, \"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Model parameters (components of final match weight)\", \"subtitle\": \"Use mousewheel to zoom\"}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-3e936f170076224a42263d2902120b87\": [{\"comparison_name\": \"probability_two_random_records_match\", \"sql_condition\": null, \"label_for_charts\": \"\", \"m_probability\": null, \"u_probability\": null, \"m_probability_description\": null, \"u_probability_description\": null, \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": null, \"is_null_level\": false, \"bayes_factor\": 2.3743083676072958e-06, \"log2_bayes_factor\": -18.684061249539493, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 0, \"bayes_factor_description\": \"The probability that two random records drawn at random match is 0.000 or one in  421,176.3 records.This is equivalent to a starting match weight of -18.684.\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": -1}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": 
\"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Exact match on company_name_no_legal\", \"m_probability\": 0.5058151241048487, \"u_probability\": 1.058468514765883e-06, \"m_probability_description\": \"Amongst matching record comparisons, 50.58% of records (i.e. one in 1.977) are in the exact match on company_name_no_legal comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.0001058% of records (i.e. one in 944,761) are in the exact match on company_name_no_legal comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"company_name_no_legal\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 477874.5111910365, \"log2_bayes_factor\": 18.866272293791525, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on company_name_no_legal` then comparison is 477,875 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"jaro_winkler_similarity(\\\"company_name_no_legal_l\\\", \\\"company_name_no_legal_r\\\") >= 0.95\", \"label_for_charts\": \"Jaro-Winkler distance of company_name_no_legal >= 0.95\", \"m_probability\": 0.0023255330584916914, \"u_probability\": 3.4622801884865333e-07, \"m_probability_description\": \"Amongst matching record comparisons, 0.2326% of records (i.e. one in 430) are in the jaro-winkler distance of company_name_no_legal >= 0.95 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 3.462e-05% of records (i.e. 
one in 2,888,270) are in the jaro-winkler distance of company_name_no_legal >= 0.95 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 6716.767366849798, \"log2_bayes_factor\": 12.713551347144678, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of company_name_no_legal >= 0.95` then comparison is 6,717 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.9860442187318108, \"u_probability\": 0.9999985953034664, \"m_probability_description\": \"Amongst matching record comparisons, 98.6% of records (i.e. one in 1.014) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 100% of records (i.e. 
one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.9860456038266525, \"log2_bayes_factor\": -0.020273723240453193, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.014 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"\\\"street_address_l\\\" = \\\"street_address_r\\\"\", \"label_for_charts\": \"Exact match on street_address\", \"m_probability\": 0.10984848485092213, \"u_probability\": 1.110897329048679e-05, \"m_probability_description\": \"Amongst matching record comparisons, 10.98% of records (i.e. one in 9.103) are in the exact match on street_address comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.001111% of records (i.e. 
one in 90,017) are in the exact match on street_address comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"street_address\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 9888.266177126492, \"log2_bayes_factor\": 13.271501863589227, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on street_address` then comparison is 9,888 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"levenshtein(\\\"street_address_l\\\", \\\"street_address_r\\\") <= 1\", \"label_for_charts\": \"Levenshtein distance of street_address <= 1\", \"m_probability\": 0.008522727272916373, \"u_probability\": 2.0140578467881662e-05, \"m_probability_description\": \"Amongst matching record comparisons, 0.8523% of records (i.e. one in 117) are in the levenshtein distance of street_address <= 1 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.002014% of records (i.e. 
one in 49,651) are in the levenshtein distance of street_address <= 1 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 423.16199043178585, \"log2_bayes_factor\": 8.725066236228873, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `levenshtein distance of street_address <= 1` then comparison is 423 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.8816287878761615, \"u_probability\": 0.9999687504482416, \"m_probability_description\": \"Amongst matching record comparisons, 88.16% of records (i.e. one in 1.134) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 100% of records (i.e. one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.8816563392415677, \"log2_bayes_factor\": -0.18171167752617032, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.134 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"state\", \"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Exact match on state\", \"m_probability\": 0.811444988419664, \"u_probability\": 0.05114359014936638, \"m_probability_description\": \"Amongst matching record comparisons, 81.14% of records (i.e. 
one in 1.232) are in the exact match on state comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 5.114% of records (i.e. one in 19.55) are in the exact match on state comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"state\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 15.866015390194834, \"log2_bayes_factor\": 3.9878679484166293, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on state` then comparison is 15.87 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 2}, {\"comparison_name\": \"state\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.18855501158033613, \"u_probability\": 0.9488564098506336, \"m_probability_description\": \"Amongst matching record comparisons, 18.86% of records (i.e. one in 5.303) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 94.89% of records (i.e. 
one in 1.054) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.19871817234181718, \"log2_bayes_factor\": -2.331204284963926, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 5.032 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 2}, {\"comparison_name\": \"city\", \"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Exact match on city\", \"m_probability\": 0.6839748681947907, \"u_probability\": 0.006604962777275461, \"m_probability_description\": \"Amongst matching record comparisons, 68.4% of records (i.e. one in 1.462) are in the exact match on city comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.6605% of records (i.e. one in 151) are in the exact match on city comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"city\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 103.55468929333308, \"log2_bayes_factor\": 6.6942490747388375, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on city` then comparison is 104 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 3}, {\"comparison_name\": \"city\", \"sql_condition\": \"jaro_winkler_similarity(\\\"city_l\\\", \\\"city_r\\\") >= 0.9\", \"label_for_charts\": \"Jaro-Winkler distance of city >= 0.9\", \"m_probability\": 0.02146174771532578, \"u_probability\": 0.0004725625881274015, \"m_probability_description\": \"Amongst matching record comparisons, 2.146% of records (i.e. 
one in 46.59) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.04726% of records (i.e. one in 2,116) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 45.41567245170867, \"log2_bayes_factor\": 5.505118336601349, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of city >= 0.9` then comparison is 45.42 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 3}, {\"comparison_name\": \"city\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.29456338408988353, \"u_probability\": 0.9929224746345972, \"m_probability_description\": \"Amongst matching record comparisons, 29.46% of records (i.e. one in 3.395) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 99.29% of records (i.e. one in 1.007) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.2966630241684126, \"log2_bayes_factor\": -1.75310297316913, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 3.371 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 3}]}}, {\"mode\": \"vega-lite\"});\n",
        "</script>"
       ],
       "text/plain": [
        "alt.VConcatChart(...)"
       ]
      },
-     "execution_count": 38,
+     "execution_count": 32,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1281,7 +1281,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 33,
    "id": "f365f59e-e4d0-44f3-a1fb-62e0d63d7ba3",
    "metadata": {},
    "outputs": [
@@ -1290,23 +1290,23 @@
       "text/html": [
        "\n",
        "<style>\n",
-       "  #altair-viz-750b76fa16304920bb6a14f2dce0c3d7.vega-embed {\n",
+       "  #altair-viz-5edd235dc47d4396873cf579f0a3881f.vega-embed {\n",
        "    width: 100%;\n",
        "    display: flex;\n",
        "  }\n",
        "\n",
-       "  #altair-viz-750b76fa16304920bb6a14f2dce0c3d7.vega-embed details,\n",
-       "  #altair-viz-750b76fa16304920bb6a14f2dce0c3d7.vega-embed details summary {\n",
+       "  #altair-viz-5edd235dc47d4396873cf579f0a3881f.vega-embed details,\n",
+       "  #altair-viz-5edd235dc47d4396873cf579f0a3881f.vega-embed details summary {\n",
        "    position: relative;\n",
        "  }\n",
        "</style>\n",
-       "<div id=\"altair-viz-750b76fa16304920bb6a14f2dce0c3d7\"></div>\n",
+       "<div id=\"altair-viz-5edd235dc47d4396873cf579f0a3881f\"></div>\n",
        "<script type=\"text/javascript\">\n",
        "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
        "  (function(spec, embedOpt){\n",
        "    let outputDiv = document.currentScript.previousElementSibling;\n",
-       "    if (outputDiv.id !== \"altair-viz-750b76fa16304920bb6a14f2dce0c3d7\") {\n",
-       "      outputDiv = document.getElementById(\"altair-viz-750b76fa16304920bb6a14f2dce0c3d7\");\n",
+       "    if (outputDiv.id !== \"altair-viz-5edd235dc47d4396873cf579f0a3881f\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-5edd235dc47d4396873cf579f0a3881f\");\n",
        "    }\n",
        "    const paths = {\n",
        "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
@@ -1352,14 +1352,14 @@
        "        .catch(showError)\n",
        "        .then(() => displayChart(vegaEmbed));\n",
        "    }\n",
-       "  })({\"config\": {\"view\": {\"continuousWidth\": 300, \"continuousHeight\": 300, \"discreteHeight\": 300, \"discreteWidth\": 400}, \"header\": {\"title\": null}, \"title\": {\"anchor\": \"middle\", \"offset\": 10}}, \"hconcat\": [{\"mark\": \"bar\", \"encoding\": {\"color\": {\"value\": \"green\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labelAlign\": \"left\", \"labelAnchor\": \"middle\", \"labelAngle\": 0}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"title\": \"Proportion of record comparisons\"}, \"field\": \"m_probability\", \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Amongst 
matching record comparisons:\", \"fontSize\": 12, \"fontWeight\": \"bold\"}, \"transform\": [{\"filter\": \"(datum.bayes_factor != 'no-op filter due to vega lite issue 4680')\"}], \"width\": 150}, {\"mark\": \"bar\", \"encoding\": {\"color\": {\"value\": \"red\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labels\": false}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"title\": \"Proportion of record comparisons\"}, \"field\": \"u_probability\", \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Amongst non-matching record comparisons:\", \"fontSize\": 12, \"fontWeight\": \"bold\"}, \"transform\": 
[{\"filter\": \"(datum.bayes_factor != 'no-op filter2 due to vega lite issue 4680')\"}], \"width\": 150}], \"data\": {\"name\": \"data-da21a3ec309ec8fce463d576250e1f0d\"}, \"title\": {\"text\": \"Proportion of record comparisons in each comparison level by match status\", \"subtitle\": \"(m and u probabilities)\"}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-da21a3ec309ec8fce463d576250e1f0d\": [{\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Exact match on company_name_no_legal\", \"m_probability\": 0.5058150813342962, \"u_probability\": 1.2180592034555192e-06, \"m_probability_description\": \"Amongst matching record comparisons, 50.58% of records (i.e. one in 1.977) are in the exact match on company_name_no_legal comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.0001218% of records (i.e. 
one in 820,978) are in the exact match on company_name_no_legal comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"company_name_no_legal\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 415263.13326917647, \"log2_bayes_factor\": 18.663666270565923, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on company_name_no_legal` then comparison is 415,263 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"jaro_winkler_similarity(\\\"company_name_no_legal_l\\\", \\\"company_name_no_legal_r\\\") >= 0.95\", \"label_for_charts\": \"Jaro-Winkler distance of company_name_no_legal >= 0.95\", \"m_probability\": 0.0023254626281341145, \"u_probability\": 3.8253098951495645e-07, \"m_probability_description\": \"Amongst matching record comparisons, 0.2325% of records (i.e. one in 430) are in the jaro-winkler distance of company_name_no_legal >= 0.95 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 3.825e-05% of records (i.e. 
one in 2,614,167) are in the jaro-winkler distance of company_name_no_legal >= 0.95 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 6079.148335361709, \"log2_bayes_factor\": 12.569653506618183, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of company_name_no_legal >= 0.95` then comparison is 6,079 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.9860443747032734, \"u_probability\": 0.9999983994098071, \"m_probability_description\": \"Amongst matching record comparisons, 98.6% of records (i.e. one in 1.014) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 100% of records (i.e. 
one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.9860459529587555, \"log2_bayes_factor\": -0.020273212421214715, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.014 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"\\\"street_address_l\\\" = \\\"street_address_r\\\"\", \"label_for_charts\": \"Exact match on street_address\", \"m_probability\": 0.10984848485090659, \"u_probability\": 1.143566326549975e-05, \"m_probability_description\": \"Amongst matching record comparisons, 10.98% of records (i.e. one in 9.103) are in the exact match on street_address comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.001144% of records (i.e. 
one in 87,446) are in the exact match on street_address comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"street_address\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 9605.781693687017, \"log2_bayes_factor\": 13.229687306123738, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on street_address` then comparison is 9,606 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"levenshtein(\\\"street_address_l\\\", \\\"street_address_r\\\") <= 1\", \"label_for_charts\": \"Levenshtein distance of street_address <= 1\", \"m_probability\": 0.008522727272915167, \"u_probability\": 2.119020349813114e-05, \"m_probability_description\": \"Amongst matching record comparisons, 0.8523% of records (i.e. one in 117) are in the levenshtein distance of street_address <= 1 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.002119% of records (i.e. 
one in 47,192) are in the levenshtein distance of street_address <= 1 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 402.20129427576404, \"log2_bayes_factor\": 8.651773913989402, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `levenshtein distance of street_address <= 1` then comparison is 402 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.8816287878761782, \"u_probability\": 0.9999673741332363, \"m_probability_description\": \"Amongst matching record comparisons, 88.16% of records (i.e. one in 1.134) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 100% of records (i.e. one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.8816575527180243, \"log2_bayes_factor\": -0.18170969185989266, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.134 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"state\", \"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Exact match on state\", \"m_probability\": 0.8114435216387779, \"u_probability\": 0.052535716053222686, \"m_probability_description\": \"Amongst matching record comparisons, 81.14% of records (i.e. 
one in 1.232) are in the exact match on state comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 5.254% of records (i.e. one in 19.03) are in the exact match on state comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"state\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 15.445559375582198, \"log2_bayes_factor\": 3.949120215368288, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on state` then comparison is 15.45 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 2}, {\"comparison_name\": \"state\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.1885564783612222, \"u_probability\": 0.9474642839467773, \"m_probability_description\": \"Amongst matching record comparisons, 18.86% of records (i.e. one in 5.303) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 94.75% of records (i.e. 
one in 1.055) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.19901170055272938, \"log2_bayes_factor\": -2.3290748408362094, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 5.025 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 2}, {\"comparison_name\": \"city\", \"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Exact match on city\", \"m_probability\": 0.6839727297704452, \"u_probability\": 0.0067046866658963065, \"m_probability_description\": \"Amongst matching record comparisons, 68.4% of records (i.e. one in 1.462) are in the exact match on city comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.6705% of records (i.e. one in 149) are in the exact match on city comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"city\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 102.01412293426084, \"log2_bayes_factor\": 6.672625083900734, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on city` then comparison is 102 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 3}, {\"comparison_name\": \"city\", \"sql_condition\": \"jaro_winkler_similarity(\\\"city_l\\\", \\\"city_r\\\") >= 0.9\", \"label_for_charts\": \"Jaro-Winkler distance of city >= 0.9\", \"m_probability\": 0.021460503551289617, \"u_probability\": 0.0005311505676706402, \"m_probability_description\": \"Amongst matching record comparisons, 2.146% of records (i.e. 
one in 46.6) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.05312% of records (i.e. one in 1,883) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 40.40380422712267, \"log2_bayes_factor\": 5.336419231458947, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of city >= 0.9` then comparison is 40.4 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 3}, {\"comparison_name\": \"city\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.2945667666782652, \"u_probability\": 0.9927641627664331, \"m_probability_description\": \"Amongst matching record comparisons, 29.46% of records (i.e. one in 3.395) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 99.28% of records (i.e. one in 1.007) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.2967137389986223, \"log2_bayes_factor\": -1.7528563641383663, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 3.37 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 3}]}}, {\"mode\": \"vega-lite\"});\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 300, \"continuousHeight\": 300, \"discreteHeight\": 300, \"discreteWidth\": 400}, \"header\": {\"title\": null}, \"title\": {\"anchor\": \"middle\", \"offset\": 10}}, \"hconcat\": [{\"mark\": \"bar\", \"encoding\": {\"color\": {\"value\": \"green\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labelAlign\": \"left\", \"labelAnchor\": \"middle\", \"labelAngle\": 0}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"title\": \"Proportion of record comparisons\"}, \"field\": \"m_probability\", \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Amongst 
matching record comparisons:\", \"fontSize\": 12, \"fontWeight\": \"bold\"}, \"transform\": [{\"filter\": \"(datum.bayes_factor != 'no-op filter due to vega lite issue 4680')\"}], \"width\": 150}, {\"mark\": \"bar\", \"encoding\": {\"color\": {\"value\": \"red\"}, \"row\": {\"field\": \"comparison_name\", \"header\": {\"labels\": false}, \"sort\": {\"field\": \"comparison_sort_order\"}, \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"comparison_name\", \"title\": \"Comparison name\", \"type\": \"nominal\"}, {\"field\": \"label_for_charts\", \"title\": \"Label\", \"type\": \"ordinal\"}, {\"field\": \"sql_condition\", \"title\": \"SQL condition\", \"type\": \"nominal\"}, {\"field\": \"m_probability\", \"format\": \".10~g\", \"title\": \"M probability\", \"type\": \"quantitative\"}, {\"field\": \"u_probability\", \"format\": \".10~g\", \"title\": \"U probability\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor\", \"format\": \",.6f\", \"title\": \"Bayes factor = m/u\", \"type\": \"quantitative\"}, {\"field\": \"log2_bayes_factor\", \"format\": \".4~g\", \"title\": \"Match weight = log2(m/u)\", \"type\": \"quantitative\"}, {\"field\": \"bayes_factor_description\", \"title\": \"Match weight description\", \"type\": \"nominal\"}, {\"field\": \"m_probability_description\", \"title\": \"m probability description\", \"type\": \"nominal\"}, {\"field\": \"u_probability_description\", \"title\": \"u probability description\", \"type\": \"nominal\"}], \"x\": {\"axis\": {\"title\": \"Proportion of record comparisons\"}, \"field\": \"u_probability\", \"type\": \"quantitative\"}, \"y\": {\"axis\": {\"title\": null}, \"field\": \"label_for_charts\", \"sort\": {\"field\": \"comparison_vector_value\", \"order\": \"descending\"}, \"type\": \"nominal\"}}, \"height\": {\"step\": 12}, \"resolve\": {\"scale\": {\"y\": \"independent\"}}, \"title\": {\"text\": \"Amongst non-matching record comparisons:\", \"fontSize\": 12, \"fontWeight\": \"bold\"}, \"transform\": 
[{\"filter\": \"(datum.bayes_factor != 'no-op filter2 due to vega lite issue 4680')\"}], \"width\": 150}], \"data\": {\"name\": \"data-cfb028fa48bfea4ef958ec7a0190eacf\"}, \"title\": {\"text\": \"Proportion of record comparisons in each comparison level by match status\", \"subtitle\": \"(m and u probabilities)\"}, \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.9.3.json\", \"datasets\": {\"data-cfb028fa48bfea4ef958ec7a0190eacf\": [{\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"\\\"company_name_no_legal_l\\\" = \\\"company_name_no_legal_r\\\"\", \"label_for_charts\": \"Exact match on company_name_no_legal\", \"m_probability\": 0.5058151241048487, \"u_probability\": 1.058468514765883e-06, \"m_probability_description\": \"Amongst matching record comparisons, 50.58% of records (i.e. one in 1.977) are in the exact match on company_name_no_legal comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.0001058% of records (i.e. 
one in 944,761) are in the exact match on company_name_no_legal comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"company_name_no_legal\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 477874.5111910365, \"log2_bayes_factor\": 18.866272293791525, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on company_name_no_legal` then comparison is 477,875 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"jaro_winkler_similarity(\\\"company_name_no_legal_l\\\", \\\"company_name_no_legal_r\\\") >= 0.95\", \"label_for_charts\": \"Jaro-Winkler distance of company_name_no_legal >= 0.95\", \"m_probability\": 0.0023255330584916914, \"u_probability\": 3.4622801884865333e-07, \"m_probability_description\": \"Amongst matching record comparisons, 0.2326% of records (i.e. one in 430) are in the jaro-winkler distance of company_name_no_legal >= 0.95 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 3.462e-05% of records (i.e. 
one in 2,888,270) are in the jaro-winkler distance of company_name_no_legal >= 0.95 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 6716.767366849798, \"log2_bayes_factor\": 12.713551347144678, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of company_name_no_legal >= 0.95` then comparison is 6,717 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"company_name_no_legal\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.9860442187318108, \"u_probability\": 0.9999985953034664, \"m_probability_description\": \"Amongst matching record comparisons, 98.6% of records (i.e. one in 1.014) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 100% of records (i.e. 
one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.9860456038266525, \"log2_bayes_factor\": -0.020273723240453193, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.014 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 0}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"\\\"street_address_l\\\" = \\\"street_address_r\\\"\", \"label_for_charts\": \"Exact match on street_address\", \"m_probability\": 0.10984848485092213, \"u_probability\": 1.110897329048679e-05, \"m_probability_description\": \"Amongst matching record comparisons, 10.98% of records (i.e. one in 9.103) are in the exact match on street_address comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.001111% of records (i.e. 
one in 90,017) are in the exact match on street_address comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"street_address\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 9888.266177126492, \"log2_bayes_factor\": 13.271501863589227, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on street_address` then comparison is 9,888 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"levenshtein(\\\"street_address_l\\\", \\\"street_address_r\\\") <= 1\", \"label_for_charts\": \"Levenshtein distance of street_address <= 1\", \"m_probability\": 0.008522727272916373, \"u_probability\": 2.0140578467881662e-05, \"m_probability_description\": \"Amongst matching record comparisons, 0.8523% of records (i.e. one in 117) are in the levenshtein distance of street_address <= 1 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.002014% of records (i.e. 
one in 49,651) are in the levenshtein distance of street_address <= 1 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 423.16199043178585, \"log2_bayes_factor\": 8.725066236228873, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `levenshtein distance of street_address <= 1` then comparison is 423 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"street_address\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.8816287878761615, \"u_probability\": 0.9999687504482416, \"m_probability_description\": \"Amongst matching record comparisons, 88.16% of records (i.e. one in 1.134) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 100% of records (i.e. one in 1) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.8816563392415677, \"log2_bayes_factor\": -0.18171167752617032, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 1.134 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 1}, {\"comparison_name\": \"state\", \"sql_condition\": \"\\\"state_l\\\" = \\\"state_r\\\"\", \"label_for_charts\": \"Exact match on state\", \"m_probability\": 0.811444988419664, \"u_probability\": 0.05114359014936638, \"m_probability_description\": \"Amongst matching record comparisons, 81.14% of records (i.e. 
one in 1.232) are in the exact match on state comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 5.114% of records (i.e. one in 19.55) are in the exact match on state comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"state\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 15.866015390194834, \"log2_bayes_factor\": 3.9878679484166293, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `exact match on state` then comparison is 15.87 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 2}, {\"comparison_name\": \"state\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.18855501158033613, \"u_probability\": 0.9488564098506336, \"m_probability_description\": \"Amongst matching record comparisons, 18.86% of records (i.e. one in 5.303) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 94.89% of records (i.e. 
one in 1.054) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.19871817234181718, \"log2_bayes_factor\": -2.331204284963926, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 1, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 5.032 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 2}, {\"comparison_name\": \"city\", \"sql_condition\": \"\\\"city_l\\\" = \\\"city_r\\\"\", \"label_for_charts\": \"Exact match on city\", \"m_probability\": 0.6839748681947907, \"u_probability\": 0.006604962777275461, \"m_probability_description\": \"Amongst matching record comparisons, 68.4% of records (i.e. one in 1.462) are in the exact match on city comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.6605% of records (i.e. one in 151) are in the exact match on city comparison level\", \"has_tf_adjustments\": true, \"tf_adjustment_column\": \"city\", \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 103.55468929333308, \"log2_bayes_factor\": 6.6942490747388375, \"comparison_vector_value\": 2, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `exact match on city` then comparison is 104 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 3}, {\"comparison_name\": \"city\", \"sql_condition\": \"jaro_winkler_similarity(\\\"city_l\\\", \\\"city_r\\\") >= 0.9\", \"label_for_charts\": \"Jaro-Winkler distance of city >= 0.9\", \"m_probability\": 0.02146174771532578, \"u_probability\": 0.0004725625881274015, \"m_probability_description\": \"Amongst matching record comparisons, 2.146% of records (i.e. 
one in 46.59) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 0.04726% of records (i.e. one in 2,116) are in the jaro-winkler distance of city >= 0.9 comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 45.41567245170867, \"log2_bayes_factor\": 5.505118336601349, \"comparison_vector_value\": 1, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `jaro-winkler distance of city >= 0.9` then comparison is 45.42 times more likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 3}, {\"comparison_name\": \"city\", \"sql_condition\": \"ELSE\", \"label_for_charts\": \"All other comparisons\", \"m_probability\": 0.29456338408988353, \"u_probability\": 0.9929224746345972, \"m_probability_description\": \"Amongst matching record comparisons, 29.46% of records (i.e. one in 3.395) are in the all other comparisons comparison level\", \"u_probability_description\": \"Amongst non-matching record comparisons, 99.29% of records (i.e. one in 1.007) are in the all other comparisons comparison level\", \"has_tf_adjustments\": false, \"tf_adjustment_column\": null, \"tf_adjustment_weight\": 1.0, \"is_null_level\": false, \"bayes_factor\": 0.2966630241684126, \"log2_bayes_factor\": -1.75310297316913, \"comparison_vector_value\": 0, \"max_comparison_vector_value\": 2, \"bayes_factor_description\": \"If comparison level is `all other comparisons` then comparison is 3.371 times less likely to be a match\", \"probability_two_random_records_match\": 2.374302730280456e-06, \"comparison_sort_order\": 3}]}}, {\"mode\": \"vega-lite\"});\n",
        "</script>"
       ],
       "text/plain": [
        "alt.HConcatChart(...)"
       ]
      },
-     "execution_count": 39,
+     "execution_count": 33,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1391,7 +1391,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 34,
    "id": "94e96441-89b6-4516-aa6a-4d1593ce03be",
    "metadata": {},
    "outputs": [
@@ -1399,7 +1399,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Blocking time: 0.16 seconds\n",
+      "Blocking time: 0.14 seconds\n",
       "Predict time: 0.26 seconds\n"
      ]
     }
@@ -1410,7 +1410,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": 35,
    "id": "722effbc-24e8-45ca-aa64-8cdcd75846e0",
    "metadata": {},
    "outputs": [],
@@ -1420,7 +1420,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": 36,
    "id": "21b19a11-15c6-46ab-bd73-7c4752f7e25e",
    "metadata": {},
    "outputs": [
@@ -1487,8 +1487,8 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>295287</th>\n",
-       "      <td>-22.967975</td>\n",
-       "      <td>1.218850e-07</td>\n",
+       "      <td>-22.970354</td>\n",
+       "      <td>1.216843e-07</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
        "      <td>56230</td>\n",
@@ -1505,184 +1505,184 @@
        "      <td>0</td>\n",
        "      <td>0.000049</td>\n",
        "      <td>0.000049</td>\n",
-       "      <td>0.881658</td>\n",
+       "      <td>0.881656</td>\n",
        "      <td>1.000000</td>\n",
        "      <td>ne</td>\n",
        "      <td>mo</td>\n",
        "      <td>0</td>\n",
        "      <td>0.006455</td>\n",
        "      <td>0.010118</td>\n",
-       "      <td>0.199012</td>\n",
+       "      <td>0.198718</td>\n",
        "      <td>1.000000</td>\n",
        "      <td>omaha</td>\n",
        "      <td>st louis</td>\n",
        "      <td>0</td>\n",
        "      <td>0.003448</td>\n",
        "      <td>0.002764</td>\n",
-       "      <td>0.296714</td>\n",
+       "      <td>0.296663</td>\n",
        "      <td>1.000000</td>\n",
        "      <td>UNN PSFK</td>\n",
        "      <td>UNN ELKTRK</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>384509</th>\n",
-       "      <td>-22.967975</td>\n",
-       "      <td>1.218850e-07</td>\n",
+       "      <th>307206</th>\n",
+       "      <td>-22.970354</td>\n",
+       "      <td>1.216843e-07</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>56484</td>\n",
-       "      <td>19138</td>\n",
-       "      <td>united states lime and minerals</td>\n",
-       "      <td>united water conservation</td>\n",
+       "      <td>29764</td>\n",
+       "      <td>9337</td>\n",
+       "      <td>international lease finance</td>\n",
+       "      <td>international paper riegel</td>\n",
        "      <td>0</td>\n",
        "      <td>0.000037</td>\n",
-       "      <td>0.000024</td>\n",
+       "      <td>0.000012</td>\n",
        "      <td>0.986046</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>5429 lbj fwy</td>\n",
-       "      <td>1701 north lombard st</td>\n",
+       "      <td>1999 ave of the stars</td>\n",
+       "      <td>6400 poplar ave</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.000024</td>\n",
-       "      <td>0.000012</td>\n",
-       "      <td>0.881658</td>\n",
+       "      <td>0.000110</td>\n",
+       "      <td>0.000061</td>\n",
+       "      <td>0.881656</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>tx</td>\n",
        "      <td>ca</td>\n",
+       "      <td>tn</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.079841</td>\n",
        "      <td>0.157960</td>\n",
-       "      <td>0.199012</td>\n",
+       "      <td>0.010622</td>\n",
+       "      <td>0.198718</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>dallas</td>\n",
-       "      <td>oxnard</td>\n",
+       "      <td>los angeles</td>\n",
+       "      <td>memphis</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.013855</td>\n",
-       "      <td>0.000257</td>\n",
-       "      <td>0.296714</td>\n",
+       "      <td>0.008107</td>\n",
+       "      <td>0.001357</td>\n",
+       "      <td>0.296663</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>UNTT STTS LM ANT MNRLS</td>\n",
-       "      <td>UNTT WTR KNSRFXN</td>\n",
+       "      <td>INTRNXNL LS FNNS</td>\n",
+       "      <td>INTRNXNL PPR RJL</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>384504</th>\n",
-       "      <td>-22.967975</td>\n",
-       "      <td>1.218850e-07</td>\n",
+       "      <th>307205</th>\n",
+       "      <td>-22.970354</td>\n",
+       "      <td>1.216843e-07</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>56436</td>\n",
-       "      <td>19138</td>\n",
-       "      <td>united rentals</td>\n",
-       "      <td>united water conservation</td>\n",
+       "      <td>29818</td>\n",
+       "      <td>9337</td>\n",
+       "      <td>international speedway</td>\n",
+       "      <td>international paper riegel</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.000024</td>\n",
-       "      <td>0.000024</td>\n",
+       "      <td>0.000037</td>\n",
+       "      <td>0.000012</td>\n",
        "      <td>0.986046</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>100 first stamford pl</td>\n",
-       "      <td>1701 north lombard st</td>\n",
+       "      <td>1801 w international speedway blvd</td>\n",
+       "      <td>6400 poplar ave</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.000122</td>\n",
        "      <td>0.000012</td>\n",
-       "      <td>0.881658</td>\n",
+       "      <td>0.000061</td>\n",
+       "      <td>0.881656</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>ct</td>\n",
-       "      <td>ca</td>\n",
+       "      <td>fl</td>\n",
+       "      <td>tn</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.020876</td>\n",
-       "      <td>0.157960</td>\n",
-       "      <td>0.199012</td>\n",
+       "      <td>0.048477</td>\n",
+       "      <td>0.010622</td>\n",
+       "      <td>0.198718</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>stamford</td>\n",
-       "      <td>oxnard</td>\n",
+       "      <td>daytona beach</td>\n",
+       "      <td>memphis</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.003950</td>\n",
-       "      <td>0.000257</td>\n",
-       "      <td>0.296714</td>\n",
+       "      <td>0.000245</td>\n",
+       "      <td>0.001357</td>\n",
+       "      <td>0.296663</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>UNTT RNTLS</td>\n",
-       "      <td>UNTT WTR KNSRFXN</td>\n",
+       "      <td>INTRNXNL SPTW</td>\n",
+       "      <td>INTRNXNL PPR RJL</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>384503</th>\n",
-       "      <td>-22.967975</td>\n",
-       "      <td>1.218850e-07</td>\n",
+       "      <th>307204</th>\n",
+       "      <td>-22.970354</td>\n",
+       "      <td>1.216843e-07</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>56424</td>\n",
-       "      <td>19138</td>\n",
-       "      <td>united parcel service</td>\n",
-       "      <td>united water conservation</td>\n",
+       "      <td>59433</td>\n",
+       "      <td>20092</td>\n",
+       "      <td>west penn funding</td>\n",
+       "      <td>west line solar</td>\n",
        "      <td>0</td>\n",
        "      <td>0.000024</td>\n",
-       "      <td>0.000024</td>\n",
+       "      <td>0.000012</td>\n",
        "      <td>0.986046</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>55 glenlake pkwy ne</td>\n",
-       "      <td>1701 north lombard st</td>\n",
+       "      <td>2325b2 renaissance dr</td>\n",
+       "      <td>2180 south 1300 east</td>\n",
        "      <td>0</td>\n",
        "      <td>0.000012</td>\n",
-       "      <td>0.000012</td>\n",
-       "      <td>0.881658</td>\n",
+       "      <td>0.000110</td>\n",
+       "      <td>0.881656</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>ga</td>\n",
-       "      <td>ca</td>\n",
+       "      <td>nv</td>\n",
+       "      <td>ut</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.018626</td>\n",
-       "      <td>0.157960</td>\n",
-       "      <td>0.199012</td>\n",
+       "      <td>0.020458</td>\n",
+       "      <td>0.010549</td>\n",
+       "      <td>0.198718</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>atlanta</td>\n",
-       "      <td>oxnard</td>\n",
+       "      <td>las vegas</td>\n",
+       "      <td>salt lake city</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.008462</td>\n",
-       "      <td>0.000257</td>\n",
-       "      <td>0.296714</td>\n",
+       "      <td>0.010724</td>\n",
+       "      <td>0.005772</td>\n",
+       "      <td>0.296663</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>UNTT PRSL SRFS</td>\n",
-       "      <td>UNTT WTR KNSRFXN</td>\n",
+       "      <td>WST PN FNTNK</td>\n",
+       "      <td>WST LN SLR</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>384502</th>\n",
-       "      <td>-22.967975</td>\n",
-       "      <td>1.218850e-07</td>\n",
+       "      <th>307203</th>\n",
+       "      <td>-22.970354</td>\n",
+       "      <td>1.216843e-07</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>56312</td>\n",
-       "      <td>19138</td>\n",
-       "      <td>united bancorp /oh/</td>\n",
-       "      <td>united water conservation</td>\n",
+       "      <td>39648</td>\n",
+       "      <td>12908</td>\n",
+       "      <td>north country financial</td>\n",
+       "      <td>north american energy services</td>\n",
        "      <td>0</td>\n",
        "      <td>0.000024</td>\n",
-       "      <td>0.000024</td>\n",
+       "      <td>0.000110</td>\n",
        "      <td>0.986046</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>201 south fourth st</td>\n",
-       "      <td>1701 north lombard st</td>\n",
+       "      <td>3530 north country dr</td>\n",
+       "      <td>1070 erie ave</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.000012</td>\n",
-       "      <td>0.000012</td>\n",
-       "      <td>0.881658</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000037</td>\n",
+       "      <td>0.881656</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>oh</td>\n",
-       "      <td>ca</td>\n",
+       "      <td>mi</td>\n",
+       "      <td>ny</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.016991</td>\n",
-       "      <td>0.157960</td>\n",
-       "      <td>0.199012</td>\n",
+       "      <td>0.015147</td>\n",
+       "      <td>0.120228</td>\n",
+       "      <td>0.198718</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>martins ferry</td>\n",
-       "      <td>oxnard</td>\n",
+       "      <td>traverse city</td>\n",
+       "      <td>north tonawanda</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.000024</td>\n",
-       "      <td>0.000257</td>\n",
-       "      <td>0.296714</td>\n",
+       "      <td>0.000269</td>\n",
+       "      <td>0.000049</td>\n",
+       "      <td>0.296663</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>UNTT BNKRP</td>\n",
-       "      <td>UNTT WTR KNSRFXN</td>\n",
+       "      <td>NR0 KNTR FNNXL</td>\n",
+       "      <td>NR0 AMRKN ENRJ SRFSS</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
@@ -1727,7 +1727,7 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>163815</th>\n",
-       "      <td>27.519606</td>\n",
+       "      <td>27.519613</td>\n",
        "      <td>1.000000e+00</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
@@ -1738,36 +1738,36 @@
        "      <td>2</td>\n",
        "      <td>0.000073</td>\n",
        "      <td>0.000073</td>\n",
-       "      <td>415263.133269</td>\n",
-       "      <td>0.016616</td>\n",
+       "      <td>477874.511191</td>\n",
+       "      <td>0.014439</td>\n",
        "      <td>33 third st se</td>\n",
        "      <td>33 third st se</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000037</td>\n",
        "      <td>0.000037</td>\n",
-       "      <td>9605.781694</td>\n",
-       "      <td>0.311992</td>\n",
+       "      <td>9888.266177</td>\n",
+       "      <td>0.303079</td>\n",
        "      <td>sd</td>\n",
        "      <td>sd</td>\n",
        "      <td>1</td>\n",
        "      <td>0.001930</td>\n",
        "      <td>0.001930</td>\n",
-       "      <td>15.445559</td>\n",
-       "      <td>27.217182</td>\n",
+       "      <td>15.866015</td>\n",
+       "      <td>26.495963</td>\n",
        "      <td>huron</td>\n",
        "      <td>huron</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000073</td>\n",
        "      <td>0.000073</td>\n",
-       "      <td>102.014123</td>\n",
-       "      <td>91.382644</td>\n",
+       "      <td>103.554689</td>\n",
+       "      <td>90.023441</td>\n",
        "      <td>NR0WSTRN PBLK SRFS</td>\n",
        "      <td>NR0WSTRN PBLK SRFS</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>241593</th>\n",
-       "      <td>27.526514</td>\n",
+       "      <td>27.526521</td>\n",
        "      <td>1.000000e+00</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
@@ -1778,36 +1778,36 @@
        "      <td>2</td>\n",
        "      <td>0.000037</td>\n",
        "      <td>0.000037</td>\n",
-       "      <td>415263.133269</td>\n",
-       "      <td>0.033231</td>\n",
+       "      <td>477874.511191</td>\n",
+       "      <td>0.028877</td>\n",
        "      <td>163 acorn ln</td>\n",
        "      <td>163 acorn ln</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000037</td>\n",
        "      <td>0.000037</td>\n",
-       "      <td>9605.781694</td>\n",
-       "      <td>0.311992</td>\n",
+       "      <td>9888.266177</td>\n",
+       "      <td>0.303079</td>\n",
        "      <td>vt</td>\n",
        "      <td>vt</td>\n",
        "      <td>1</td>\n",
        "      <td>0.001537</td>\n",
        "      <td>0.001537</td>\n",
-       "      <td>15.445559</td>\n",
-       "      <td>34.184780</td>\n",
+       "      <td>15.866015</td>\n",
+       "      <td>33.278930</td>\n",
        "      <td>colchester</td>\n",
        "      <td>colchester</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000183</td>\n",
        "      <td>0.000183</td>\n",
-       "      <td>102.014123</td>\n",
-       "      <td>36.553058</td>\n",
+       "      <td>103.554689</td>\n",
+       "      <td>36.009376</td>\n",
        "      <td>KRN MNTN PWR</td>\n",
        "      <td>KRN MNTN PWR</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>165487</th>\n",
-       "      <td>27.757338</td>\n",
+       "      <td>27.757345</td>\n",
        "      <td>1.000000e+00</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
@@ -1818,36 +1818,36 @@
        "      <td>2</td>\n",
        "      <td>0.000024</td>\n",
        "      <td>0.000024</td>\n",
-       "      <td>415263.133269</td>\n",
-       "      <td>0.049847</td>\n",
+       "      <td>477874.511191</td>\n",
+       "      <td>0.043316</td>\n",
        "      <td>one clarks is</td>\n",
        "      <td>one clarks is</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000024</td>\n",
        "      <td>0.000024</td>\n",
-       "      <td>9605.781694</td>\n",
-       "      <td>0.467987</td>\n",
+       "      <td>9888.266177</td>\n",
+       "      <td>0.454618</td>\n",
        "      <td>wi</td>\n",
        "      <td>wi</td>\n",
        "      <td>1</td>\n",
        "      <td>0.008840</td>\n",
        "      <td>0.008840</td>\n",
-       "      <td>15.445559</td>\n",
-       "      <td>5.943112</td>\n",
+       "      <td>15.866015</td>\n",
+       "      <td>5.785628</td>\n",
        "      <td>wausau</td>\n",
        "      <td>wausau</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000061</td>\n",
        "      <td>0.000061</td>\n",
-       "      <td>102.014123</td>\n",
-       "      <td>109.659173</td>\n",
+       "      <td>103.554689</td>\n",
+       "      <td>108.028129</td>\n",
        "      <td>WS PPR MLS</td>\n",
        "      <td>WS PPR MLS</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>340414</th>\n",
-       "      <td>27.884365</td>\n",
+       "      <td>27.884373</td>\n",
        "      <td>1.000000e+00</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
@@ -1858,36 +1858,36 @@
        "      <td>2</td>\n",
        "      <td>0.000024</td>\n",
        "      <td>0.000024</td>\n",
-       "      <td>415263.133269</td>\n",
-       "      <td>0.049847</td>\n",
+       "      <td>477874.511191</td>\n",
+       "      <td>0.043316</td>\n",
        "      <td>520 francis st</td>\n",
        "      <td>520 francis st</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000024</td>\n",
        "      <td>0.000024</td>\n",
-       "      <td>9605.781694</td>\n",
-       "      <td>0.467987</td>\n",
+       "      <td>9888.266177</td>\n",
+       "      <td>0.454618</td>\n",
        "      <td>mo</td>\n",
        "      <td>mo</td>\n",
        "      <td>1</td>\n",
        "      <td>0.010118</td>\n",
        "      <td>0.010118</td>\n",
-       "      <td>15.445559</td>\n",
-       "      <td>5.192099</td>\n",
+       "      <td>15.866015</td>\n",
+       "      <td>5.054515</td>\n",
        "      <td>st joseph</td>\n",
        "      <td>st joseph</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000049</td>\n",
        "      <td>0.000049</td>\n",
-       "      <td>102.014123</td>\n",
-       "      <td>137.073967</td>\n",
+       "      <td>103.554689</td>\n",
+       "      <td>135.035162</td>\n",
        "      <td>ST JSF LT ANT PWR</td>\n",
        "      <td>ST JSF LT ANT PWR</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>274760</th>\n",
-       "      <td>29.211012</td>\n",
+       "      <td>29.211020</td>\n",
        "      <td>1.000000e+00</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
@@ -1898,29 +1898,29 @@
        "      <td>2</td>\n",
        "      <td>0.000037</td>\n",
        "      <td>0.000037</td>\n",
-       "      <td>415263.133269</td>\n",
-       "      <td>0.033231</td>\n",
+       "      <td>477874.511191</td>\n",
+       "      <td>0.028877</td>\n",
        "      <td>161 wellington rd</td>\n",
        "      <td>161 wellington rd</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000024</td>\n",
        "      <td>0.000024</td>\n",
-       "      <td>9605.781694</td>\n",
-       "      <td>0.467987</td>\n",
+       "      <td>9888.266177</td>\n",
+       "      <td>0.454618</td>\n",
        "      <td>vt</td>\n",
        "      <td>vt</td>\n",
        "      <td>1</td>\n",
        "      <td>0.001537</td>\n",
        "      <td>0.001537</td>\n",
-       "      <td>15.445559</td>\n",
-       "      <td>34.184780</td>\n",
+       "      <td>15.866015</td>\n",
+       "      <td>33.278930</td>\n",
        "      <td>brattleboro</td>\n",
        "      <td>brattleboro</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000086</td>\n",
        "      <td>0.000086</td>\n",
-       "      <td>102.014123</td>\n",
-       "      <td>78.327981</td>\n",
+       "      <td>103.554689</td>\n",
+       "      <td>77.162949</td>\n",
        "      <td>FBRMRK</td>\n",
        "      <td>FBRMRK</td>\n",
        "      <td>0</td>\n",
@@ -1931,23 +1931,23 @@
        "</div>"
       ],
       "text/plain": [
-       "        match_weight  match_probability         source_dataset_l         source_dataset_r  record_id_l  record_id_r          company_name_no_legal_l      company_name_no_legal_r  gamma_company_name_no_legal  tf_company_name_no_legal_l  tf_company_name_no_legal_r  bf_company_name_no_legal  bf_tf_adj_company_name_no_legal       street_address_l       street_address_r  gamma_street_address  tf_street_address_l  tf_street_address_r  bf_street_address  bf_tf_adj_street_address state_l state_r  gamma_state  tf_state_l  tf_state_r   bf_state  bf_tf_adj_state         city_l       city_r  gamma_city  tf_city_l  tf_city_r     bf_city  bf_tf_adj_city   company_name_mphone_l company_name_mphone_r match_key\n",
-       "295287    -22.967975       1.218850e-07  __splink__input_table_0  __splink__input_table_1        56230        19078                    union pacific               union electric                            0                    0.000049                    0.000098                  0.986046                         1.000000          1416 dodge st                mc 1400                     0             0.000049             0.000049           0.881658                  1.000000      ne      mo            0    0.006455    0.010118   0.199012         1.000000          omaha     st louis           0   0.003448   0.002764    0.296714        1.000000                UNN PSFK            UNN ELKTRK         0\n",
-       "384509    -22.967975       1.218850e-07  __splink__input_table_0  __splink__input_table_1        56484        19138  united states lime and minerals    united water conservation                            0                    0.000037                    0.000024                  0.986046                         1.000000           5429 lbj fwy  1701 north lombard st                     0             0.000024             0.000012           0.881658                  1.000000      tx      ca            0    0.079841    0.157960   0.199012         1.000000         dallas       oxnard           0   0.013855   0.000257    0.296714        1.000000  UNTT STTS LM ANT MNRLS      UNTT WTR KNSRFXN         0\n",
-       "384504    -22.967975       1.218850e-07  __splink__input_table_0  __splink__input_table_1        56436        19138                   united rentals    united water conservation                            0                    0.000024                    0.000024                  0.986046                         1.000000  100 first stamford pl  1701 north lombard st                     0             0.000122             0.000012           0.881658                  1.000000      ct      ca            0    0.020876    0.157960   0.199012         1.000000       stamford       oxnard           0   0.003950   0.000257    0.296714        1.000000              UNTT RNTLS      UNTT WTR KNSRFXN         0\n",
-       "384503    -22.967975       1.218850e-07  __splink__input_table_0  __splink__input_table_1        56424        19138            united parcel service    united water conservation                            0                    0.000024                    0.000024                  0.986046                         1.000000    55 glenlake pkwy ne  1701 north lombard st                     0             0.000012             0.000012           0.881658                  1.000000      ga      ca            0    0.018626    0.157960   0.199012         1.000000        atlanta       oxnard           0   0.008462   0.000257    0.296714        1.000000          UNTT PRSL SRFS      UNTT WTR KNSRFXN         0\n",
-       "384502    -22.967975       1.218850e-07  __splink__input_table_0  __splink__input_table_1        56312        19138              united bancorp /oh/    united water conservation                            0                    0.000024                    0.000024                  0.986046                         1.000000    201 south fourth st  1701 north lombard st                     0             0.000012             0.000012           0.881658                  1.000000      oh      ca            0    0.016991    0.157960   0.199012         1.000000  martins ferry       oxnard           0   0.000024   0.000257    0.296714        1.000000             UNTT BNKRP       UNTT WTR KNSRFXN         0\n",
-       "...              ...                ...                      ...                      ...          ...          ...                              ...                          ...                          ...                         ...                         ...                       ...                              ...                    ...                    ...                   ...                  ...                  ...                ...                       ...     ...     ...          ...         ...         ...        ...              ...            ...          ...         ...        ...        ...         ...             ...                     ...                   ...       ...\n",
-       "163815     27.519606       1.000000e+00  __splink__input_table_0  __splink__input_table_1        39816        13109      northwestern public service  northwestern public service                            2                    0.000073                    0.000073             415263.133269                         0.016616         33 third st se         33 third st se                     2             0.000037             0.000037        9605.781694                  0.311992      sd      sd            1    0.001930    0.001930  15.445559        27.217182          huron        huron           2   0.000073   0.000073  102.014123       91.382644      NR0WSTRN PBLK SRFS    NR0WSTRN PBLK SRFS         0\n",
-       "241593     27.526514       1.000000e+00  __splink__input_table_0  __splink__input_table_1        24650         8047             green mountain power         green mountain power                            2                    0.000037                    0.000037             415263.133269                         0.033231           163 acorn ln           163 acorn ln                     2             0.000037             0.000037        9605.781694                  0.311992      vt      vt            1    0.001537    0.001537  15.445559        34.184780     colchester   colchester           2   0.000183   0.000183  102.014123       36.553058            KRN MNTN PWR          KRN MNTN PWR         0\n",
-       "165487     27.757338       1.000000e+00  __splink__input_table_0  __splink__input_table_1        58842        19906               wausau paper mills           wausau paper mills                            2                    0.000024                    0.000024             415263.133269                         0.049847          one clarks is          one clarks is                     2             0.000024             0.000024        9605.781694                  0.467987      wi      wi            1    0.008840    0.008840  15.445559         5.943112         wausau       wausau           2   0.000061   0.000061  102.014123      109.659173              WS PPR MLS            WS PPR MLS         0\n",
-       "340414     27.884365       1.000000e+00  __splink__input_table_0  __splink__input_table_1        51567        17450        st joseph light and power    st joseph light and power                            2                    0.000024                    0.000024             415263.133269                         0.049847         520 francis st         520 francis st                     2             0.000024             0.000024        9605.781694                  0.467987      mo      mo            1    0.010118    0.010118  15.445559         5.192099      st joseph    st joseph           2   0.000049   0.000049  102.014123      137.073967       ST JSF LT ANT PWR     ST JSF LT ANT PWR         0\n",
-       "274760     29.211012       1.000000e+00  __splink__input_table_0  __splink__input_table_1        20588         6741                        fibermark                    fibermark                            2                    0.000037                    0.000037             415263.133269                         0.033231      161 wellington rd      161 wellington rd                     2             0.000024             0.000024        9605.781694                  0.467987      vt      vt            1    0.001537    0.001537  15.445559        34.184780    brattleboro  brattleboro           2   0.000086   0.000086  102.014123       78.327981                  FBRMRK                FBRMRK         0\n",
+       "        match_weight  match_probability         source_dataset_l         source_dataset_r  record_id_l  record_id_r      company_name_no_legal_l         company_name_no_legal_r  gamma_company_name_no_legal  tf_company_name_no_legal_l  tf_company_name_no_legal_r  bf_company_name_no_legal  bf_tf_adj_company_name_no_legal                    street_address_l      street_address_r  gamma_street_address  tf_street_address_l  tf_street_address_r  bf_street_address  bf_tf_adj_street_address state_l state_r  gamma_state  tf_state_l  tf_state_r   bf_state  bf_tf_adj_state         city_l           city_r  gamma_city  tf_city_l  tf_city_r     bf_city  bf_tf_adj_city company_name_mphone_l company_name_mphone_r match_key\n",
+       "295287    -22.970354       1.216843e-07  __splink__input_table_0  __splink__input_table_1        56230        19078                union pacific                  union electric                            0                    0.000049                    0.000098                  0.986046                         1.000000                       1416 dodge st               mc 1400                     0             0.000049             0.000049           0.881656                  1.000000      ne      mo            0    0.006455    0.010118   0.198718         1.000000          omaha         st louis           0   0.003448   0.002764    0.296663        1.000000              UNN PSFK            UNN ELKTRK         0\n",
+       "307206    -22.970354       1.216843e-07  __splink__input_table_0  __splink__input_table_1        29764         9337  international lease finance      international paper riegel                            0                    0.000037                    0.000012                  0.986046                         1.000000               1999 ave of the stars       6400 poplar ave                     0             0.000110             0.000061           0.881656                  1.000000      ca      tn            0    0.157960    0.010622   0.198718         1.000000    los angeles          memphis           0   0.008107   0.001357    0.296663        1.000000      INTRNXNL LS FNNS      INTRNXNL PPR RJL         0\n",
+       "307205    -22.970354       1.216843e-07  __splink__input_table_0  __splink__input_table_1        29818         9337       international speedway      international paper riegel                            0                    0.000037                    0.000012                  0.986046                         1.000000  1801 w international speedway blvd       6400 poplar ave                     0             0.000012             0.000061           0.881656                  1.000000      fl      tn            0    0.048477    0.010622   0.198718         1.000000  daytona beach          memphis           0   0.000245   0.001357    0.296663        1.000000         INTRNXNL SPTW      INTRNXNL PPR RJL         0\n",
+       "307204    -22.970354       1.216843e-07  __splink__input_table_0  __splink__input_table_1        59433        20092            west penn funding                 west line solar                            0                    0.000024                    0.000012                  0.986046                         1.000000               2325b2 renaissance dr  2180 south 1300 east                     0             0.000012             0.000110           0.881656                  1.000000      nv      ut            0    0.020458    0.010549   0.198718         1.000000      las vegas   salt lake city           0   0.010724   0.005772    0.296663        1.000000          WST PN FNTNK            WST LN SLR         0\n",
+       "307203    -22.970354       1.216843e-07  __splink__input_table_0  __splink__input_table_1        39648        12908      north country financial  north american energy services                            0                    0.000024                    0.000110                  0.986046                         1.000000               3530 north country dr         1070 erie ave                     0             0.000024             0.000037           0.881656                  1.000000      mi      ny            0    0.015147    0.120228   0.198718         1.000000  traverse city  north tonawanda           0   0.000269   0.000049    0.296663        1.000000        NR0 KNTR FNNXL  NR0 AMRKN ENRJ SRFSS         0\n",
+       "...              ...                ...                      ...                      ...          ...          ...                          ...                             ...                          ...                         ...                         ...                       ...                              ...                                 ...                   ...                   ...                  ...                  ...                ...                       ...     ...     ...          ...         ...         ...        ...              ...            ...              ...         ...        ...        ...         ...             ...                   ...                   ...       ...\n",
+       "163815     27.519613       1.000000e+00  __splink__input_table_0  __splink__input_table_1        39816        13109  northwestern public service     northwestern public service                            2                    0.000073                    0.000073             477874.511191                         0.014439                      33 third st se        33 third st se                     2             0.000037             0.000037        9888.266177                  0.303079      sd      sd            1    0.001930    0.001930  15.866015        26.495963          huron            huron           2   0.000073   0.000073  103.554689       90.023441    NR0WSTRN PBLK SRFS    NR0WSTRN PBLK SRFS         0\n",
+       "241593     27.526521       1.000000e+00  __splink__input_table_0  __splink__input_table_1        24650         8047         green mountain power            green mountain power                            2                    0.000037                    0.000037             477874.511191                         0.028877                        163 acorn ln          163 acorn ln                     2             0.000037             0.000037        9888.266177                  0.303079      vt      vt            1    0.001537    0.001537  15.866015        33.278930     colchester       colchester           2   0.000183   0.000183  103.554689       36.009376          KRN MNTN PWR          KRN MNTN PWR         0\n",
+       "165487     27.757345       1.000000e+00  __splink__input_table_0  __splink__input_table_1        58842        19906           wausau paper mills              wausau paper mills                            2                    0.000024                    0.000024             477874.511191                         0.043316                       one clarks is         one clarks is                     2             0.000024             0.000024        9888.266177                  0.454618      wi      wi            1    0.008840    0.008840  15.866015         5.785628         wausau           wausau           2   0.000061   0.000061  103.554689      108.028129            WS PPR MLS            WS PPR MLS         0\n",
+       "340414     27.884373       1.000000e+00  __splink__input_table_0  __splink__input_table_1        51567        17450    st joseph light and power       st joseph light and power                            2                    0.000024                    0.000024             477874.511191                         0.043316                      520 francis st        520 francis st                     2             0.000024             0.000024        9888.266177                  0.454618      mo      mo            1    0.010118    0.010118  15.866015         5.054515      st joseph        st joseph           2   0.000049   0.000049  103.554689      135.035162     ST JSF LT ANT PWR     ST JSF LT ANT PWR         0\n",
+       "274760     29.211020       1.000000e+00  __splink__input_table_0  __splink__input_table_1        20588         6741                    fibermark                       fibermark                            2                    0.000037                    0.000037             477874.511191                         0.028877                   161 wellington rd     161 wellington rd                     2             0.000024             0.000024        9888.266177                  0.454618      vt      vt            1    0.001537    0.001537  15.866015        33.278930    brattleboro      brattleboro           2   0.000086   0.000086  103.554689       77.162949                FBRMRK                FBRMRK         0\n",
        "\n",
        "[590575 rows x 37 columns]"
       ]
      },
-     "execution_count": 42,
+     "execution_count": 36,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1958,7 +1958,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 43,
+   "execution_count": 37,
    "id": "c0b292c8-26ed-407a-866e-75851577d567",
    "metadata": {},
    "outputs": [],
@@ -1972,7 +1972,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 44,
+   "execution_count": 38,
    "id": "e8744d54-3cc6-434c-bbbe-a10d2d3cd9c0",
    "metadata": {},
    "outputs": [],
@@ -1985,7 +1985,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 45,
+   "execution_count": 39,
    "id": "5103190c-3775-427f-a8f2-cc8a8f79892b",
    "metadata": {},
    "outputs": [],
@@ -1997,7 +1997,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 46,
+   "execution_count": 40,
    "id": "8fa04f18-beff-4bdc-bbbb-705950eab5b8",
    "metadata": {},
    "outputs": [
@@ -2070,8 +2070,8 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>218797</th>\n",
-       "      <td>3.824578</td>\n",
-       "      <td>0.934072</td>\n",
+       "      <td>3.824584</td>\n",
+       "      <td>0.934073</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
        "      <td>14692</td>\n",
@@ -2088,22 +2088,22 @@
        "      <td>2</td>\n",
        "      <td>0.000122</td>\n",
        "      <td>0.000122</td>\n",
-       "      <td>9605.781694</td>\n",
-       "      <td>0.093597</td>\n",
+       "      <td>9888.266177</td>\n",
+       "      <td>0.090924</td>\n",
        "      <td>ct</td>\n",
        "      <td>ct</td>\n",
        "      <td>1</td>\n",
        "      <td>0.020876</td>\n",
        "      <td>0.020876</td>\n",
-       "      <td>15.445559</td>\n",
-       "      <td>2.516547</td>\n",
+       "      <td>15.866015</td>\n",
+       "      <td>2.449862</td>\n",
        "      <td>stamford</td>\n",
        "      <td>stamford</td>\n",
        "      <td>2</td>\n",
        "      <td>0.003950</td>\n",
        "      <td>0.003950</td>\n",
-       "      <td>102.014123</td>\n",
-       "      <td>1.697510</td>\n",
+       "      <td>103.554689</td>\n",
+       "      <td>1.672262</td>\n",
        "      <td>KRN</td>\n",
        "      <td>ENTRJ NKLR PWR MRKTNK</td>\n",
        "      <td>1</td>\n",
@@ -2116,7 +2116,7 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>220036</th>\n",
-       "      <td>4.619987</td>\n",
+       "      <td>4.619994</td>\n",
        "      <td>0.960922</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
@@ -2134,22 +2134,22 @@
        "      <td>2</td>\n",
        "      <td>0.000330</td>\n",
        "      <td>0.000330</td>\n",
-       "      <td>9605.781694</td>\n",
-       "      <td>0.034666</td>\n",
+       "      <td>9888.266177</td>\n",
+       "      <td>0.033675</td>\n",
        "      <td>mi</td>\n",
        "      <td>mi</td>\n",
        "      <td>1</td>\n",
        "      <td>0.015147</td>\n",
        "      <td>0.015147</td>\n",
-       "      <td>15.445559</td>\n",
-       "      <td>3.468423</td>\n",
+       "      <td>15.866015</td>\n",
+       "      <td>3.376515</td>\n",
        "      <td>detroit</td>\n",
        "      <td>detroit</td>\n",
        "      <td>2</td>\n",
        "      <td>0.001162</td>\n",
        "      <td>0.001162</td>\n",
-       "      <td>102.014123</td>\n",
-       "      <td>5.771535</td>\n",
+       "      <td>103.554689</td>\n",
+       "      <td>5.685691</td>\n",
        "      <td>TT ELKTRK SKRTSXN FNTNK I</td>\n",
        "      <td>TT SSTNBL JNRXN</td>\n",
        "      <td>1</td>\n",
@@ -2161,8 +2161,8 @@
        "      <td>64331</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>358152</th>\n",
-       "      <td>4.619987</td>\n",
+       "      <th>481032</th>\n",
+       "      <td>4.619994</td>\n",
        "      <td>0.960922</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
@@ -2180,22 +2180,22 @@
        "      <td>2</td>\n",
        "      <td>0.000330</td>\n",
        "      <td>0.000330</td>\n",
-       "      <td>9605.781694</td>\n",
-       "      <td>0.034666</td>\n",
+       "      <td>9888.266177</td>\n",
+       "      <td>0.033675</td>\n",
        "      <td>mi</td>\n",
        "      <td>mi</td>\n",
        "      <td>1</td>\n",
        "      <td>0.015147</td>\n",
        "      <td>0.015147</td>\n",
-       "      <td>15.445559</td>\n",
-       "      <td>3.468423</td>\n",
+       "      <td>15.866015</td>\n",
+       "      <td>3.376515</td>\n",
        "      <td>detroit</td>\n",
        "      <td>detroit</td>\n",
        "      <td>2</td>\n",
        "      <td>0.001162</td>\n",
        "      <td>0.001162</td>\n",
-       "      <td>102.014123</td>\n",
-       "      <td>5.771535</td>\n",
+       "      <td>103.554689</td>\n",
+       "      <td>5.685691</td>\n",
        "      <td>TT ELKTRK SKRTSXN FNTNK I</td>\n",
        "      <td>TT ELKTRK</td>\n",
        "      <td>0</td>\n",
@@ -2212,12 +2212,12 @@
       ],
       "text/plain": [
        "        match_weight  match_probability         source_dataset_l         source_dataset_r  record_id_l  record_id_r                company_name_no_legal_l          company_name_no_legal_r  gamma_company_name_no_legal  tf_company_name_no_legal_l  tf_company_name_no_legal_r  bf_company_name_no_legal  bf_tf_adj_company_name_no_legal       street_address_l       street_address_r  gamma_street_address  tf_street_address_l  tf_street_address_r  bf_street_address  bf_tf_adj_street_address state_l state_r  gamma_state  tf_state_l  tf_state_r   bf_state  bf_tf_adj_state    city_l    city_r  gamma_city  tf_city_l  tf_city_r     bf_city  bf_tf_adj_city      company_name_mphone_l  company_name_mphone_r match_key  record_id_x sec_company_id central_index_key                           company_name_raw  record_id_y  utility_id_eia\n",
-       "218797      3.824578           0.934072  __splink__input_table_0  __splink__input_table_1        14692         6293                                  crane  entergy nuclear power marketing                            0                    0.000012                    0.000012                  0.986046                              1.0  100 first stamford pl  100 first stamford pl                     2             0.000122             0.000122        9605.781694                  0.093597      ct      ct            1    0.020876    0.020876  15.445559         2.516547  stamford  stamford           2   0.003950   0.003950  102.014123        1.697510                        KRN  ENTRJ NKLR PWR MRKTNK         1        14692     0001944013        0001944013                                   crane co         6293           55243\n",
-       "220036      4.619987           0.960922  __splink__input_table_0  __splink__input_table_1        17752         5535  dte electric securitization funding i       dte sustainable generation                            0                    0.000012                    0.000012                  0.986046                              1.0         one energy plz         one energy plz                     2             0.000330             0.000330        9605.781694                  0.034666      mi      mi            1    0.015147    0.015147  15.445559         3.468423   detroit   detroit           2   0.001162   0.001162  102.014123        5.771535  TT ELKTRK SKRTSXN FNTNK I        TT SSTNBL JNRXN         1        17752     0001876068        0001876068  dte electric securitization funding i llc         5535           64331\n",
-       "358152      4.619987           0.960922  __splink__input_table_0  __splink__input_table_1        17752         5522  dte electric securitization funding i                     dte electric                            0                    0.000012                    0.000037                  0.986046                              1.0         one energy plz         one energy plz                     2             0.000330             0.000330        9605.781694                  0.034666      mi      mi            1    0.015147    0.015147  15.445559         3.468423   detroit   detroit           2   0.001162   0.001162  102.014123        5.771535  TT ELKTRK SKRTSXN FNTNK I              TT ELKTRK         0        17752     0001876068        0001876068  dte electric securitization funding i llc         5522            5109"
+       "218797      3.824584           0.934073  __splink__input_table_0  __splink__input_table_1        14692         6293                                  crane  entergy nuclear power marketing                            0                    0.000012                    0.000012                  0.986046                              1.0  100 first stamford pl  100 first stamford pl                     2             0.000122             0.000122        9888.266177                  0.090924      ct      ct            1    0.020876    0.020876  15.866015         2.449862  stamford  stamford           2   0.003950   0.003950  103.554689        1.672262                        KRN  ENTRJ NKLR PWR MRKTNK         1        14692     0001944013        0001944013                                   crane co         6293           55243\n",
+       "220036      4.619994           0.960922  __splink__input_table_0  __splink__input_table_1        17752         5535  dte electric securitization funding i       dte sustainable generation                            0                    0.000012                    0.000012                  0.986046                              1.0         one energy plz         one energy plz                     2             0.000330             0.000330        9888.266177                  0.033675      mi      mi            1    0.015147    0.015147  15.866015         3.376515   detroit   detroit           2   0.001162   0.001162  103.554689        5.685691  TT ELKTRK SKRTSXN FNTNK I        TT SSTNBL JNRXN         1        17752     0001876068        0001876068  dte electric securitization funding i llc         5535           64331\n",
+       "481032      4.619994           0.960922  __splink__input_table_0  __splink__input_table_1        17752         5522  dte electric securitization funding i                     dte electric                            0                    0.000012                    0.000037                  0.986046                              1.0         one energy plz         one energy plz                     2             0.000330             0.000330        9888.266177                  0.033675      mi      mi            1    0.015147    0.015147  15.866015         3.376515   detroit   detroit           2   0.001162   0.001162  103.554689        5.685691  TT ELKTRK SKRTSXN FNTNK I              TT ELKTRK         0        17752     0001876068        0001876068  dte electric securitization funding i llc         5522            5109"
       ]
      },
-     "execution_count": 46,
+     "execution_count": 40,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2236,7 +2236,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 47,
+   "execution_count": 41,
    "id": "11190456-12a9-49df-b863-7a6f674e39eb",
    "metadata": {},
    "outputs": [],
@@ -2246,7 +2246,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 48,
+   "execution_count": 42,
    "id": "5a57bae5-6188-4a6c-be9a-14a159f82d81",
    "metadata": {},
    "outputs": [],
@@ -2256,7 +2256,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 49,
+   "execution_count": 43,
    "id": "461725d0-28c8-43dd-bcd5-086b7d3f7d4b",
    "metadata": {},
    "outputs": [],
@@ -2271,7 +2271,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 50,
+   "execution_count": 44,
    "id": "4d45f339-7a5b-466a-81f5-c71e425a77df",
    "metadata": {},
    "outputs": [],
@@ -2281,7 +2281,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 51,
+   "execution_count": 45,
    "id": "182732ea-39c5-4fb4-9429-5f7aed781ef5",
    "metadata": {},
    "outputs": [],
@@ -2294,7 +2294,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 52,
+   "execution_count": 46,
    "id": "bc058dbf-6f71-4978-90ce-5405edbbf9e5",
    "metadata": {},
    "outputs": [
@@ -2384,7 +2384,7 @@
        "      <td>1</td>\n",
        "      <td>13310.0</td>\n",
        "      <td>4281.0</td>\n",
-       "      <td>0.999982</td>\n",
+       "      <td>0.999984</td>\n",
        "      <td>1.0</td>\n",
        "      <td>both</td>\n",
        "      <td>1.0</td>\n",
@@ -2398,7 +2398,7 @@
        "      <td>1</td>\n",
        "      <td>17793.0</td>\n",
        "      <td>5564.0</td>\n",
-       "      <td>0.927293</td>\n",
+       "      <td>0.927294</td>\n",
        "      <td>2.0</td>\n",
        "      <td>both</td>\n",
        "      <td>0.0</td>\n",
@@ -2496,7 +2496,7 @@
        "      <td>0</td>\n",
        "      <td>21579.0</td>\n",
        "      <td>6780.0</td>\n",
-       "      <td>0.986542</td>\n",
+       "      <td>0.986543</td>\n",
        "      <td>0.0</td>\n",
        "      <td>both</td>\n",
        "      <td>1.0</td>\n",
@@ -2510,7 +2510,7 @@
        "      <td>0</td>\n",
        "      <td>21579.0</td>\n",
        "      <td>6763.0</td>\n",
-       "      <td>0.085466</td>\n",
+       "      <td>0.085467</td>\n",
        "      <td>0.0</td>\n",
        "      <td>both</td>\n",
        "      <td>0.0</td>\n",
@@ -2594,7 +2594,7 @@
        "      <td>1</td>\n",
        "      <td>40084.0</td>\n",
        "      <td>13243.0</td>\n",
-       "      <td>0.999813</td>\n",
+       "      <td>0.999834</td>\n",
        "      <td>2.0</td>\n",
        "      <td>both</td>\n",
        "      <td>1.0</td>\n",
@@ -2636,7 +2636,7 @@
        "      <td>1</td>\n",
        "      <td>49303.0</td>\n",
        "      <td>16270.0</td>\n",
-       "      <td>0.559071</td>\n",
+       "      <td>0.559072</td>\n",
        "      <td>0.0</td>\n",
        "      <td>both</td>\n",
        "      <td>0.0</td>\n",
@@ -2678,30 +2678,30 @@
        "0         0000003153             195                 alabama power co                                    NaN      1       1701.0        478.0           1.000000                          2.0       both              1.0\n",
        "1         0001868941           58702             fluence energy, inc.                                Fluence      0      21792.0       6889.0           0.016529                          0.0       both              0.0\n",
        "2         0000041091            7140                 georgia power co                                    NaN      1      23416.0       7653.0           0.999997                          2.0       both              1.0\n",
-       "3         0000022198            4062  columbus southern power co /oh/             Columbus Southern Power Co      1      13310.0       4281.0           0.999982                          1.0       both              1.0\n",
-       "4         0001326160            5416                 duke energy corp                                    NaN      1      17793.0       5564.0           0.927293                          2.0       both              0.0\n",
+       "3         0000022198            4062  columbus southern power co /oh/             Columbus Southern Power Co      1      13310.0       4281.0           0.999984                          1.0       both              1.0\n",
+       "4         0001326160            5416                 duke energy corp                                    NaN      1      17793.0       5564.0           0.927294                          2.0       both              0.0\n",
        "5         0000030371           54905       duke energy carolinas, llc              Duke Energy Carolinas LLC      1      17790.0       5558.0           0.999987                          2.0       both              1.0\n",
        "6         0000869446           57140      berkshire realty co inc /de  Berkshire Wind Power Cooperative Corp      0       7449.0       1712.0           0.001912                          0.0       both              0.0\n",
        "7         0000092122           18195                      southern co               southern co services inc      0      50964.0      17068.0           0.007216                          0.0       both              0.0\n",
        "8         0000092122           17650                      southern co                      Southern Power Co      0      50963.0      17089.0           0.034232                          0.0       both              0.0\n",
        "9         0000075488           14328        pacific gas & electric co                                    NaN      1      41598.0      13933.0           0.999948                          2.0       both              1.0\n",
        "10        0001031296            6526                 firstenergy corp                            FirstEnergy      0      21579.0       6776.0           0.999998                          2.0       both              1.0\n",
-       "11        0001031296           54776                 firstenergy corp    FirstEnergy Nuclear Generation Corp      0      21579.0       6780.0           0.986542                          0.0       both              1.0\n",
-       "12        0001031296            6458                 firstenergy corp                  First Energy Services      0      21579.0       6763.0           0.085466                          0.0       both              0.0\n",
+       "11        0001031296           54776                 firstenergy corp    FirstEnergy Nuclear Generation Corp      0      21579.0       6780.0           0.986543                          0.0       both              1.0\n",
+       "12        0001031296            6458                 firstenergy corp                  First Energy Services      0      21579.0       6763.0           0.085467                          0.0       both              0.0\n",
        "13        0001031296           32208                 firstenergy corp                      First Energy Corp      1          NaN          NaN                NaN                          NaN  left_only              0.0\n",
        "14        0000100122           24211         tucson electric power co                                    NaN      1      55725.0      18901.0           1.000000                          2.0       both              1.0\n",
        "15        0000096271           18454                tampa electric co                                    NaN      1      53604.0      18180.0           0.991059                          2.0       both              1.0\n",
        "16        0000715957            5248             dominion energy, inc                                    NaN      1      17484.0       5386.0           0.999985                          2.0       both              1.0\n",
        "17        0001013871           59883                  nrg energy, inc     NRG Energy Gas & Wind Holdings Inc      0      40084.0      13240.0           0.300165                          0.0       both              0.0\n",
-       "18        0001013871           13377                   nrg energy inc                         NRG Energy Inc      1      40084.0      13243.0           0.999813                          2.0       both              1.0\n",
+       "18        0001013871           13377                   nrg energy inc                         NRG Energy Inc      1      40084.0      13243.0           0.999834                          2.0       both              1.0\n",
        "19        0000788816           13994            oglethorpe power corp                                    NaN      1      40576.0      13515.0           1.000000                          2.0       both              1.0\n",
        "20        0000018675            3266           central maine power co                                    NaN      1      10876.0       3424.0           1.000000                          2.0       both              1.0\n",
-       "21        0001032208           61296                    sempra energy                      Sempra Generation      1      49303.0      16270.0           0.559071                          0.0       both              0.0\n",
+       "21        0001032208           61296                    sempra energy                      Sempra Generation      1      49303.0      16270.0           0.559072                          0.0       both              0.0\n",
        "22        0000004904             488   american electric power co inc            American Electric Power Inc      1       2927.0        793.0           0.996076                          2.0       both              1.0\n",
        "23        0000715957            5248             dominion energy, inc                   Dominion Energy Inc.      1      17484.0       5386.0           0.999985                          2.0       both              1.0"
       ]
      },
-     "execution_count": 52,
+     "execution_count": 46,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2712,7 +2712,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 53,
+   "execution_count": 47,
    "id": "2fc6ed05-5b35-4856-a668-f16e253e7fea",
    "metadata": {},
    "outputs": [],
@@ -2728,7 +2728,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 54,
+   "execution_count": 48,
    "id": "99a7bc64-9f32-4f53-9a89-63a31ff7debe",
    "metadata": {},
    "outputs": [
@@ -2738,7 +2738,7 @@
        "(np.float64(0.8666666666666667), np.float64(0.8125), 0.7916666666666666)"
       ]
      },
-     "execution_count": 54,
+     "execution_count": 48,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2749,7 +2749,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 55,
+   "execution_count": 49,
    "id": "08932be5-b90c-440d-9efb-156cb4d63c93",
    "metadata": {},
    "outputs": [
@@ -2799,7 +2799,7 @@
        "Positive                   3                  13"
       ]
      },
-     "execution_count": 55,
+     "execution_count": 49,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2814,7 +2814,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 56,
+   "execution_count": 50,
    "id": "025c80e9-5055-4eaa-a873-38b910cd7f94",
    "metadata": {},
    "outputs": [],
@@ -2824,7 +2824,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 57,
+   "execution_count": 51,
    "id": "e4f44fda-1b55-4f9c-a96d-5eec36dac768",
    "metadata": {},
    "outputs": [
@@ -2872,7 +2872,7 @@
        "      <td>1</td>\n",
        "      <td>17793.0</td>\n",
        "      <td>5564.0</td>\n",
-       "      <td>0.927293</td>\n",
+       "      <td>0.927294</td>\n",
        "      <td>2.0</td>\n",
        "      <td>both</td>\n",
        "      <td>0.0</td>\n",
@@ -2900,7 +2900,7 @@
        "      <td>0</td>\n",
        "      <td>21579.0</td>\n",
        "      <td>6780.0</td>\n",
-       "      <td>0.986542</td>\n",
+       "      <td>0.986543</td>\n",
        "      <td>0.0</td>\n",
        "      <td>both</td>\n",
        "      <td>1.0</td>\n",
@@ -2928,7 +2928,7 @@
        "      <td>1</td>\n",
        "      <td>49303.0</td>\n",
        "      <td>16270.0</td>\n",
-       "      <td>0.559071</td>\n",
+       "      <td>0.559072</td>\n",
        "      <td>0.0</td>\n",
        "      <td>both</td>\n",
        "      <td>0.0</td>\n",
@@ -2939,14 +2939,14 @@
       ],
       "text/plain": [
        "   central_index_key  utility_id_eia  sec_company_name                     eia_company_name  match  record_id_l  record_id_r  match_probability  gamma_company_name_no_legal     _merge  predicted_match\n",
-       "4         0001326160            5416  duke energy corp                                  NaN      1      17793.0       5564.0           0.927293                          2.0       both              0.0\n",
+       "4         0001326160            5416  duke energy corp                                  NaN      1      17793.0       5564.0           0.927294                          2.0       both              0.0\n",
        "10        0001031296            6526  firstenergy corp                          FirstEnergy      0      21579.0       6776.0           0.999998                          2.0       both              1.0\n",
-       "11        0001031296           54776  firstenergy corp  FirstEnergy Nuclear Generation Corp      0      21579.0       6780.0           0.986542                          0.0       both              1.0\n",
+       "11        0001031296           54776  firstenergy corp  FirstEnergy Nuclear Generation Corp      0      21579.0       6780.0           0.986543                          0.0       both              1.0\n",
        "13        0001031296           32208  firstenergy corp                    First Energy Corp      1          NaN          NaN                NaN                          NaN  left_only              0.0\n",
-       "21        0001032208           61296     sempra energy                    Sempra Generation      1      49303.0      16270.0           0.559071                          0.0       both              0.0"
+       "21        0001032208           61296     sempra energy                    Sempra Generation      1      49303.0      16270.0           0.559072                          0.0       both              0.0"
       ]
      },
-     "execution_count": 57,
+     "execution_count": 51,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2957,7 +2957,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 151,
+   "execution_count": 52,
    "id": "c425a676-aa6e-4d8f-b814-931da392c2ff",
    "metadata": {},
    "outputs": [],
@@ -3073,7 +3073,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 58,
+   "execution_count": 53,
    "id": "92172e2f-39ba-49e3-8312-98597256ca4f",
    "metadata": {},
    "outputs": [],
@@ -3089,7 +3089,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 59,
+   "execution_count": 54,
    "id": "07ca81ae-1b26-4cd3-ade6-75381028028a",
    "metadata": {},
    "outputs": [
@@ -3099,7 +3099,7 @@
        "534"
       ]
      },
-     "execution_count": 59,
+     "execution_count": 54,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -3118,7 +3118,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 60,
+   "execution_count": 55,
    "id": "361b3e30-e823-4137-9062-6a00eae537fe",
    "metadata": {},
    "outputs": [
@@ -3191,7 +3191,7 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>274760</th>\n",
-       "      <td>29.211012</td>\n",
+       "      <td>29.211020</td>\n",
        "      <td>1.000000</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
@@ -3202,29 +3202,29 @@
        "      <td>2</td>\n",
        "      <td>0.000037</td>\n",
        "      <td>0.000037</td>\n",
-       "      <td>415263.133269</td>\n",
-       "      <td>0.033231</td>\n",
+       "      <td>477874.511191</td>\n",
+       "      <td>0.028877</td>\n",
        "      <td>161 wellington rd</td>\n",
        "      <td>161 wellington rd</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000024</td>\n",
        "      <td>0.000024</td>\n",
-       "      <td>9605.781694</td>\n",
-       "      <td>0.467987</td>\n",
+       "      <td>9888.266177</td>\n",
+       "      <td>0.454618</td>\n",
        "      <td>vt</td>\n",
        "      <td>vt</td>\n",
        "      <td>1</td>\n",
        "      <td>0.001537</td>\n",
        "      <td>0.001537</td>\n",
-       "      <td>15.445559</td>\n",
-       "      <td>34.184780</td>\n",
+       "      <td>15.866015</td>\n",
+       "      <td>33.278930</td>\n",
        "      <td>brattleboro</td>\n",
        "      <td>brattleboro</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000086</td>\n",
        "      <td>0.000086</td>\n",
-       "      <td>102.014123</td>\n",
-       "      <td>78.327981</td>\n",
+       "      <td>103.554689</td>\n",
+       "      <td>77.162949</td>\n",
        "      <td>FBRMRK</td>\n",
        "      <td>FBRMRK</td>\n",
        "      <td>0</td>\n",
@@ -3237,7 +3237,7 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>340414</th>\n",
-       "      <td>27.884365</td>\n",
+       "      <td>27.884373</td>\n",
        "      <td>1.000000</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
@@ -3248,29 +3248,29 @@
        "      <td>2</td>\n",
        "      <td>0.000024</td>\n",
        "      <td>0.000024</td>\n",
-       "      <td>415263.133269</td>\n",
-       "      <td>0.049847</td>\n",
+       "      <td>477874.511191</td>\n",
+       "      <td>0.043316</td>\n",
        "      <td>520 francis st</td>\n",
        "      <td>520 francis st</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000024</td>\n",
        "      <td>0.000024</td>\n",
-       "      <td>9605.781694</td>\n",
-       "      <td>0.467987</td>\n",
+       "      <td>9888.266177</td>\n",
+       "      <td>0.454618</td>\n",
        "      <td>mo</td>\n",
        "      <td>mo</td>\n",
        "      <td>1</td>\n",
        "      <td>0.010118</td>\n",
        "      <td>0.010118</td>\n",
-       "      <td>15.445559</td>\n",
-       "      <td>5.192099</td>\n",
+       "      <td>15.866015</td>\n",
+       "      <td>5.054515</td>\n",
        "      <td>st joseph</td>\n",
        "      <td>st joseph</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000049</td>\n",
        "      <td>0.000049</td>\n",
-       "      <td>102.014123</td>\n",
-       "      <td>137.073967</td>\n",
+       "      <td>103.554689</td>\n",
+       "      <td>135.035162</td>\n",
        "      <td>ST JSF LT ANT PWR</td>\n",
        "      <td>ST JSF LT ANT PWR</td>\n",
        "      <td>0</td>\n",
@@ -3283,7 +3283,7 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>165487</th>\n",
-       "      <td>27.757338</td>\n",
+       "      <td>27.757345</td>\n",
        "      <td>1.000000</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
@@ -3294,29 +3294,29 @@
        "      <td>2</td>\n",
        "      <td>0.000024</td>\n",
        "      <td>0.000024</td>\n",
-       "      <td>415263.133269</td>\n",
-       "      <td>0.049847</td>\n",
+       "      <td>477874.511191</td>\n",
+       "      <td>0.043316</td>\n",
        "      <td>one clarks is</td>\n",
        "      <td>one clarks is</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000024</td>\n",
        "      <td>0.000024</td>\n",
-       "      <td>9605.781694</td>\n",
-       "      <td>0.467987</td>\n",
+       "      <td>9888.266177</td>\n",
+       "      <td>0.454618</td>\n",
        "      <td>wi</td>\n",
        "      <td>wi</td>\n",
        "      <td>1</td>\n",
        "      <td>0.008840</td>\n",
        "      <td>0.008840</td>\n",
-       "      <td>15.445559</td>\n",
-       "      <td>5.943112</td>\n",
+       "      <td>15.866015</td>\n",
+       "      <td>5.785628</td>\n",
        "      <td>wausau</td>\n",
        "      <td>wausau</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000061</td>\n",
        "      <td>0.000061</td>\n",
-       "      <td>102.014123</td>\n",
-       "      <td>109.659173</td>\n",
+       "      <td>103.554689</td>\n",
+       "      <td>108.028129</td>\n",
        "      <td>WS PPR MLS</td>\n",
        "      <td>WS PPR MLS</td>\n",
        "      <td>0</td>\n",
@@ -3329,7 +3329,7 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>241593</th>\n",
-       "      <td>27.526514</td>\n",
+       "      <td>27.526521</td>\n",
        "      <td>1.000000</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
@@ -3340,29 +3340,29 @@
        "      <td>2</td>\n",
        "      <td>0.000037</td>\n",
        "      <td>0.000037</td>\n",
-       "      <td>415263.133269</td>\n",
-       "      <td>0.033231</td>\n",
+       "      <td>477874.511191</td>\n",
+       "      <td>0.028877</td>\n",
        "      <td>163 acorn ln</td>\n",
        "      <td>163 acorn ln</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000037</td>\n",
        "      <td>0.000037</td>\n",
-       "      <td>9605.781694</td>\n",
-       "      <td>0.311992</td>\n",
+       "      <td>9888.266177</td>\n",
+       "      <td>0.303079</td>\n",
        "      <td>vt</td>\n",
        "      <td>vt</td>\n",
        "      <td>1</td>\n",
        "      <td>0.001537</td>\n",
        "      <td>0.001537</td>\n",
-       "      <td>15.445559</td>\n",
-       "      <td>34.184780</td>\n",
+       "      <td>15.866015</td>\n",
+       "      <td>33.278930</td>\n",
        "      <td>colchester</td>\n",
        "      <td>colchester</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000183</td>\n",
        "      <td>0.000183</td>\n",
-       "      <td>102.014123</td>\n",
-       "      <td>36.553058</td>\n",
+       "      <td>103.554689</td>\n",
+       "      <td>36.009376</td>\n",
        "      <td>KRN MNTN PWR</td>\n",
        "      <td>KRN MNTN PWR</td>\n",
        "      <td>0</td>\n",
@@ -3375,7 +3375,7 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>163815</th>\n",
-       "      <td>27.519606</td>\n",
+       "      <td>27.519613</td>\n",
        "      <td>1.000000</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
@@ -3386,29 +3386,29 @@
        "      <td>2</td>\n",
        "      <td>0.000073</td>\n",
        "      <td>0.000073</td>\n",
-       "      <td>415263.133269</td>\n",
-       "      <td>0.016616</td>\n",
+       "      <td>477874.511191</td>\n",
+       "      <td>0.014439</td>\n",
        "      <td>33 third st se</td>\n",
        "      <td>33 third st se</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000037</td>\n",
        "      <td>0.000037</td>\n",
-       "      <td>9605.781694</td>\n",
-       "      <td>0.311992</td>\n",
+       "      <td>9888.266177</td>\n",
+       "      <td>0.303079</td>\n",
        "      <td>sd</td>\n",
        "      <td>sd</td>\n",
        "      <td>1</td>\n",
        "      <td>0.001930</td>\n",
        "      <td>0.001930</td>\n",
-       "      <td>15.445559</td>\n",
-       "      <td>27.217182</td>\n",
+       "      <td>15.866015</td>\n",
+       "      <td>26.495963</td>\n",
        "      <td>huron</td>\n",
        "      <td>huron</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000073</td>\n",
        "      <td>0.000073</td>\n",
-       "      <td>102.014123</td>\n",
-       "      <td>91.382644</td>\n",
+       "      <td>103.554689</td>\n",
+       "      <td>90.023441</td>\n",
        "      <td>NR0WSTRN PBLK SRFS</td>\n",
        "      <td>NR0WSTRN PBLK SRFS</td>\n",
        "      <td>0</td>\n",
@@ -3466,17 +3466,17 @@
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1483</th>\n",
-       "      <td>4.337121</td>\n",
+       "      <th>218776</th>\n",
+       "      <td>4.337127</td>\n",
        "      <td>0.952856</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>58004</td>\n",
-       "      <td>17611</td>\n",
-       "      <td>vistacare</td>\n",
-       "      <td>stirling energy systems solar three</td>\n",
+       "      <td>32941</td>\n",
+       "      <td>17608</td>\n",
+       "      <td>lifestance health group</td>\n",
+       "      <td>stirling energy systems solar one</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.000024</td>\n",
+       "      <td>0.000012</td>\n",
        "      <td>0.000037</td>\n",
        "      <td>0.986046</td>\n",
        "      <td>1.000000</td>\n",
@@ -3485,180 +3485,180 @@
        "      <td>2</td>\n",
        "      <td>0.000110</td>\n",
        "      <td>0.000110</td>\n",
-       "      <td>9605.781694</td>\n",
-       "      <td>0.103997</td>\n",
+       "      <td>9888.266177</td>\n",
+       "      <td>0.101026</td>\n",
        "      <td>az</td>\n",
        "      <td>az</td>\n",
        "      <td>1</td>\n",
        "      <td>0.012872</td>\n",
        "      <td>0.012872</td>\n",
-       "      <td>15.445559</td>\n",
-       "      <td>4.081277</td>\n",
+       "      <td>15.866015</td>\n",
+       "      <td>3.973129</td>\n",
        "      <td>scottsdale</td>\n",
        "      <td>scottsdale</td>\n",
        "      <td>2</td>\n",
        "      <td>0.004989</td>\n",
        "      <td>0.004989</td>\n",
-       "      <td>102.014123</td>\n",
-       "      <td>1.343862</td>\n",
-       "      <td>FSTKR</td>\n",
-       "      <td>STRLNK ENRJ SSTMS SLR 0R</td>\n",
+       "      <td>103.554689</td>\n",
+       "      <td>1.323874</td>\n",
+       "      <td>LFSTNS HL0 KRP</td>\n",
+       "      <td>STRLNK ENRJ SSTMS SLR ON</td>\n",
        "      <td>1</td>\n",
-       "      <td>58004</td>\n",
-       "      <td>0000787030</td>\n",
-       "      <td>0000787030</td>\n",
-       "      <td>vistacare, inc.</td>\n",
-       "      <td>17611</td>\n",
-       "      <td>56168</td>\n",
+       "      <td>32941</td>\n",
+       "      <td>0001845257</td>\n",
+       "      <td>0001845257</td>\n",
+       "      <td>lifestance health group, inc.</td>\n",
+       "      <td>17608</td>\n",
+       "      <td>56166</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>218453</th>\n",
-       "      <td>4.272157</td>\n",
-       "      <td>0.950792</td>\n",
+       "      <th>145930</th>\n",
+       "      <td>4.321967</td>\n",
+       "      <td>0.952382</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>19174</td>\n",
-       "      <td>7605</td>\n",
-       "      <td>enovis</td>\n",
-       "      <td>genon sabine delaware</td>\n",
+       "      <td>28535</td>\n",
+       "      <td>9121</td>\n",
+       "      <td>imperial holly</td>\n",
+       "      <td>imperial holly</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>477874.511191</td>\n",
+       "      <td>0.043316</td>\n",
+       "      <td>one imperial sq ste 200</td>\n",
+       "      <td>p o box 9</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.000012</td>\n",
-       "      <td>0.000012</td>\n",
-       "      <td>0.986046</td>\n",
+       "      <td>0.000024</td>\n",
+       "      <td>0.000159</td>\n",
+       "      <td>0.881656</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>2711 centerville rd</td>\n",
-       "      <td>2711 centerville rd</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.000061</td>\n",
-       "      <td>0.000061</td>\n",
-       "      <td>9605.781694</td>\n",
-       "      <td>0.187195</td>\n",
-       "      <td>de</td>\n",
-       "      <td>de</td>\n",
+       "      <td>tx</td>\n",
+       "      <td>tx</td>\n",
        "      <td>1</td>\n",
-       "      <td>0.011717</td>\n",
-       "      <td>0.011717</td>\n",
-       "      <td>15.445559</td>\n",
-       "      <td>4.483838</td>\n",
-       "      <td>wilmington</td>\n",
-       "      <td>wilmington</td>\n",
-       "      <td>2</td>\n",
-       "      <td>0.010321</td>\n",
-       "      <td>0.010321</td>\n",
-       "      <td>102.014123</td>\n",
-       "      <td>0.649640</td>\n",
-       "      <td>ENFS</td>\n",
-       "      <td>JNN SBN TLWR</td>\n",
+       "      <td>0.079841</td>\n",
+       "      <td>0.079841</td>\n",
+       "      <td>15.866015</td>\n",
+       "      <td>0.640571</td>\n",
+       "      <td>sugar land</td>\n",
+       "      <td>sugarland</td>\n",
        "      <td>1</td>\n",
-       "      <td>19174</td>\n",
-       "      <td>0001420800</td>\n",
-       "      <td>0001420800</td>\n",
-       "      <td>enovis corp</td>\n",
-       "      <td>7605</td>\n",
-       "      <td>56922</td>\n",
+       "      <td>0.000355</td>\n",
+       "      <td>0.000098</td>\n",
+       "      <td>45.415672</td>\n",
+       "      <td>1.000000</td>\n",
+       "      <td>IMPRL HL</td>\n",
+       "      <td>IMPRL HL</td>\n",
+       "      <td>0</td>\n",
+       "      <td>28535</td>\n",
+       "      <td>0000831327</td>\n",
+       "      <td>0000831327</td>\n",
+       "      <td>imperial holly corp</td>\n",
+       "      <td>9121</td>\n",
+       "      <td>9223</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1055</th>\n",
-       "      <td>4.272157</td>\n",
+       "      <th>6194</th>\n",
+       "      <td>4.272164</td>\n",
        "      <td>0.950792</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>1650</td>\n",
-       "      <td>16368</td>\n",
-       "      <td>aisystems</td>\n",
-       "      <td>shannon wind</td>\n",
+       "      <td>32403</td>\n",
+       "      <td>16195</td>\n",
+       "      <td>lease investment flight trust</td>\n",
+       "      <td>se solar trust v c</td>\n",
        "      <td>0</td>\n",
-       "      <td>0.000024</td>\n",
-       "      <td>0.000024</td>\n",
+       "      <td>0.000012</td>\n",
+       "      <td>0.000012</td>\n",
        "      <td>0.986046</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>2711 centerville rd</td>\n",
-       "      <td>2711 centerville rd</td>\n",
+       "      <td>1100 north market st</td>\n",
+       "      <td>1100 north market st</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000061</td>\n",
        "      <td>0.000061</td>\n",
-       "      <td>9605.781694</td>\n",
-       "      <td>0.187195</td>\n",
+       "      <td>9888.266177</td>\n",
+       "      <td>0.181847</td>\n",
        "      <td>de</td>\n",
        "      <td>de</td>\n",
        "      <td>1</td>\n",
        "      <td>0.011717</td>\n",
        "      <td>0.011717</td>\n",
-       "      <td>15.445559</td>\n",
-       "      <td>4.483838</td>\n",
+       "      <td>15.866015</td>\n",
+       "      <td>4.365022</td>\n",
        "      <td>wilmington</td>\n",
        "      <td>wilmington</td>\n",
        "      <td>2</td>\n",
        "      <td>0.010321</td>\n",
        "      <td>0.010321</td>\n",
-       "      <td>102.014123</td>\n",
-       "      <td>0.649640</td>\n",
-       "      <td>ASSTMS</td>\n",
-       "      <td>XNN WNT</td>\n",
+       "      <td>103.554689</td>\n",
+       "      <td>0.639977</td>\n",
+       "      <td>LS INFSTMNT FLT TRST</td>\n",
+       "      <td>S SLR TRST F K</td>\n",
        "      <td>1</td>\n",
-       "      <td>1650</td>\n",
-       "      <td>0001328769</td>\n",
-       "      <td>0001328769</td>\n",
-       "      <td>aisystems, inc.</td>\n",
-       "      <td>16368</td>\n",
-       "      <td>58872</td>\n",
+       "      <td>32403</td>\n",
+       "      <td>0001158389</td>\n",
+       "      <td>0001158389</td>\n",
+       "      <td>lease investment flight trust</td>\n",
+       "      <td>16195</td>\n",
+       "      <td>56900</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>7216</th>\n",
-       "      <td>4.272157</td>\n",
+       "      <th>1135</th>\n",
+       "      <td>4.272164</td>\n",
        "      <td>0.950792</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
-       "      <td>32403</td>\n",
-       "      <td>14089</td>\n",
-       "      <td>lease investment flight trust</td>\n",
-       "      <td>pasadena statutory trust</td>\n",
+       "      <td>22415</td>\n",
+       "      <td>7605</td>\n",
+       "      <td>fresenius kabi pharmaceuticals holding</td>\n",
+       "      <td>genon sabine delaware</td>\n",
        "      <td>0</td>\n",
        "      <td>0.000012</td>\n",
        "      <td>0.000012</td>\n",
        "      <td>0.986046</td>\n",
        "      <td>1.000000</td>\n",
-       "      <td>1100 north market st</td>\n",
-       "      <td>1100 north market st</td>\n",
+       "      <td>2711 centerville rd</td>\n",
+       "      <td>2711 centerville rd</td>\n",
        "      <td>2</td>\n",
        "      <td>0.000061</td>\n",
        "      <td>0.000061</td>\n",
-       "      <td>9605.781694</td>\n",
-       "      <td>0.187195</td>\n",
+       "      <td>9888.266177</td>\n",
+       "      <td>0.181847</td>\n",
        "      <td>de</td>\n",
        "      <td>de</td>\n",
        "      <td>1</td>\n",
        "      <td>0.011717</td>\n",
        "      <td>0.011717</td>\n",
-       "      <td>15.445559</td>\n",
-       "      <td>4.483838</td>\n",
+       "      <td>15.866015</td>\n",
+       "      <td>4.365022</td>\n",
        "      <td>wilmington</td>\n",
        "      <td>wilmington</td>\n",
        "      <td>2</td>\n",
        "      <td>0.010321</td>\n",
        "      <td>0.010321</td>\n",
-       "      <td>102.014123</td>\n",
-       "      <td>0.649640</td>\n",
-       "      <td>LS INFSTMNT FLT TRST</td>\n",
-       "      <td>PSTN STTTR TRST</td>\n",
+       "      <td>103.554689</td>\n",
+       "      <td>0.639977</td>\n",
+       "      <td>FRSNS KB FRMSTKLS HLTNK</td>\n",
+       "      <td>JNN SBN TLWR</td>\n",
        "      <td>1</td>\n",
-       "      <td>32403</td>\n",
-       "      <td>0001158389</td>\n",
-       "      <td>0001158389</td>\n",
-       "      <td>lease investment flight trust</td>\n",
-       "      <td>14089</td>\n",
-       "      <td>61235</td>\n",
+       "      <td>22415</td>\n",
+       "      <td>0001439449</td>\n",
+       "      <td>0001439449</td>\n",
+       "      <td>fresenius kabi pharmaceuticals holding, inc.</td>\n",
+       "      <td>7605</td>\n",
+       "      <td>56922</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>6113</th>\n",
-       "      <td>4.272157</td>\n",
+       "      <th>9350</th>\n",
+       "      <td>4.272164</td>\n",
        "      <td>0.950792</td>\n",
        "      <td>__splink__input_table_0</td>\n",
        "      <td>__splink__input_table_1</td>\n",
        "      <td>1626</td>\n",
-       "      <td>16195</td>\n",
+       "      <td>14089</td>\n",
        "      <td>airplanes us trust</td>\n",
-       "      <td>se solar trust v c</td>\n",
+       "      <td>pasadena statutory trust</td>\n",
        "      <td>0</td>\n",
        "      <td>0.000012</td>\n",
        "      <td>0.000012</td>\n",
@@ -3669,31 +3669,31 @@
        "      <td>2</td>\n",
        "      <td>0.000061</td>\n",
        "      <td>0.000061</td>\n",
-       "      <td>9605.781694</td>\n",
-       "      <td>0.187195</td>\n",
+       "      <td>9888.266177</td>\n",
+       "      <td>0.181847</td>\n",
        "      <td>de</td>\n",
        "      <td>de</td>\n",
        "      <td>1</td>\n",
        "      <td>0.011717</td>\n",
        "      <td>0.011717</td>\n",
-       "      <td>15.445559</td>\n",
-       "      <td>4.483838</td>\n",
+       "      <td>15.866015</td>\n",
+       "      <td>4.365022</td>\n",
        "      <td>wilmington</td>\n",
        "      <td>wilmington</td>\n",
        "      <td>2</td>\n",
        "      <td>0.010321</td>\n",
        "      <td>0.010321</td>\n",
-       "      <td>102.014123</td>\n",
-       "      <td>0.649640</td>\n",
+       "      <td>103.554689</td>\n",
+       "      <td>0.639977</td>\n",
        "      <td>ARPLNS US TRST</td>\n",
-       "      <td>S SLR TRST F K</td>\n",
+       "      <td>PSTN STTTR TRST</td>\n",
        "      <td>1</td>\n",
        "      <td>1626</td>\n",
        "      <td>0001004540</td>\n",
        "      <td>0001004540</td>\n",
        "      <td>airplanes us trust</td>\n",
-       "      <td>16195</td>\n",
-       "      <td>56900</td>\n",
+       "      <td>14089</td>\n",
+       "      <td>61235</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
@@ -3701,23 +3701,23 @@
        "</div>"
       ],
       "text/plain": [
-       "        match_weight  match_probability         source_dataset_l         source_dataset_r  record_id_l  record_id_r        company_name_no_legal_l              company_name_no_legal_r  gamma_company_name_no_legal  tf_company_name_no_legal_l  tf_company_name_no_legal_r  bf_company_name_no_legal  bf_tf_adj_company_name_no_legal      street_address_l      street_address_r  gamma_street_address  tf_street_address_l  tf_street_address_r  bf_street_address  bf_tf_adj_street_address state_l state_r  gamma_state  tf_state_l  tf_state_r   bf_state  bf_tf_adj_state       city_l       city_r  gamma_city  tf_city_l  tf_city_r     bf_city  bf_tf_adj_city company_name_mphone_l     company_name_mphone_r match_key  record_id_x sec_company_id central_index_key                company_name_raw  record_id_y  utility_id_eia\n",
-       "274760     29.211012           1.000000  __splink__input_table_0  __splink__input_table_1        20588         6741                      fibermark                            fibermark                            2                    0.000037                    0.000037             415263.133269                         0.033231     161 wellington rd     161 wellington rd                     2             0.000024             0.000024        9605.781694                  0.467987      vt      vt            1    0.001537    0.001537  15.445559        34.184780  brattleboro  brattleboro           2   0.000086   0.000086  102.014123       78.327981                FBRMRK                    FBRMRK         0        20588     0000887591        0000887591                   fibermark inc         6741            6309\n",
-       "340414     27.884365           1.000000  __splink__input_table_0  __splink__input_table_1        51567        17450      st joseph light and power            st joseph light and power                            2                    0.000024                    0.000024             415263.133269                         0.049847        520 francis st        520 francis st                     2             0.000024             0.000024        9605.781694                  0.467987      mo      mo            1    0.010118    0.010118  15.445559         5.192099    st joseph    st joseph           2   0.000049   0.000049  102.014123      137.073967     ST JSF LT ANT PWR         ST JSF LT ANT PWR         0        51567     0000086251        0000086251      st joseph light & power co        17450           17881\n",
-       "165487     27.757338           1.000000  __splink__input_table_0  __splink__input_table_1        58842        19906             wausau paper mills                   wausau paper mills                            2                    0.000024                    0.000024             415263.133269                         0.049847         one clarks is         one clarks is                     2             0.000024             0.000024        9605.781694                  0.467987      wi      wi            1    0.008840    0.008840  15.445559         5.943112       wausau       wausau           2   0.000061   0.000061  102.014123      109.659173            WS PPR MLS                WS PPR MLS         0        58842     0000105076        0000105076           wausau paper mills co        19906           20190\n",
-       "241593     27.526514           1.000000  __splink__input_table_0  __splink__input_table_1        24650         8047           green mountain power                 green mountain power                            2                    0.000037                    0.000037             415263.133269                         0.033231          163 acorn ln          163 acorn ln                     2             0.000037             0.000037        9605.781694                  0.311992      vt      vt            1    0.001537    0.001537  15.445559        34.184780   colchester   colchester           2   0.000183   0.000183  102.014123       36.553058          KRN MNTN PWR              KRN MNTN PWR         0        24650     0000043704        0000043704       green mountain power corp         8047            7601\n",
-       "163815     27.519606           1.000000  __splink__input_table_0  __splink__input_table_1        39816        13109    northwestern public service          northwestern public service                            2                    0.000073                    0.000073             415263.133269                         0.016616        33 third st se        33 third st se                     2             0.000037             0.000037        9605.781694                  0.311992      sd      sd            1    0.001930    0.001930  15.445559        27.217182        huron        huron           2   0.000073   0.000073  102.014123       91.382644    NR0WSTRN PBLK SRFS        NR0WSTRN PBLK SRFS         0        39816     0000073088        0000073088  northwestern public service co        13109           13809\n",
-       "...              ...                ...                      ...                      ...          ...          ...                            ...                                  ...                          ...                         ...                         ...                       ...                              ...                   ...                   ...                   ...                  ...                  ...                ...                       ...     ...     ...          ...         ...         ...        ...              ...          ...          ...         ...        ...        ...         ...             ...                   ...                       ...       ...          ...            ...               ...                             ...          ...             ...\n",
-       "1483        4.337121           0.952856  __splink__input_table_0  __splink__input_table_1        58004        17611                      vistacare  stirling energy systems solar three                            0                    0.000024                    0.000037                  0.986046                         1.000000  4800 n scottsdale rd  4800 n scottsdale rd                     2             0.000110             0.000110        9605.781694                  0.103997      az      az            1    0.012872    0.012872  15.445559         4.081277   scottsdale   scottsdale           2   0.004989   0.004989  102.014123        1.343862                 FSTKR  STRLNK ENRJ SSTMS SLR 0R         1        58004     0000787030        0000787030                 vistacare, inc.        17611           56168\n",
-       "218453      4.272157           0.950792  __splink__input_table_0  __splink__input_table_1        19174         7605                         enovis                genon sabine delaware                            0                    0.000012                    0.000012                  0.986046                         1.000000   2711 centerville rd   2711 centerville rd                     2             0.000061             0.000061        9605.781694                  0.187195      de      de            1    0.011717    0.011717  15.445559         4.483838   wilmington   wilmington           2   0.010321   0.010321  102.014123        0.649640                  ENFS              JNN SBN TLWR         1        19174     0001420800        0001420800                     enovis corp         7605           56922\n",
-       "1055        4.272157           0.950792  __splink__input_table_0  __splink__input_table_1         1650        16368                      aisystems                         shannon wind                            0                    0.000024                    0.000024                  0.986046                         1.000000   2711 centerville rd   2711 centerville rd                     2             0.000061             0.000061        9605.781694                  0.187195      de      de            1    0.011717    0.011717  15.445559         4.483838   wilmington   wilmington           2   0.010321   0.010321  102.014123        0.649640                ASSTMS                   XNN WNT         1         1650     0001328769        0001328769                 aisystems, inc.        16368           58872\n",
-       "7216        4.272157           0.950792  __splink__input_table_0  __splink__input_table_1        32403        14089  lease investment flight trust             pasadena statutory trust                            0                    0.000012                    0.000012                  0.986046                         1.000000  1100 north market st  1100 north market st                     2             0.000061             0.000061        9605.781694                  0.187195      de      de            1    0.011717    0.011717  15.445559         4.483838   wilmington   wilmington           2   0.010321   0.010321  102.014123        0.649640  LS INFSTMNT FLT TRST           PSTN STTTR TRST         1        32403     0001158389        0001158389   lease investment flight trust        14089           61235\n",
-       "6113        4.272157           0.950792  __splink__input_table_0  __splink__input_table_1         1626        16195             airplanes us trust                   se solar trust v c                            0                    0.000012                    0.000012                  0.986046                         1.000000  1100 north market st  1100 north market st                     2             0.000061             0.000061        9605.781694                  0.187195      de      de            1    0.011717    0.011717  15.445559         4.483838   wilmington   wilmington           2   0.010321   0.010321  102.014123        0.649640        ARPLNS US TRST            S SLR TRST F K         1         1626     0001004540        0001004540              airplanes us trust        16195           56900\n",
+       "        match_weight  match_probability         source_dataset_l         source_dataset_r  record_id_l  record_id_r                 company_name_no_legal_l            company_name_no_legal_r  gamma_company_name_no_legal  tf_company_name_no_legal_l  tf_company_name_no_legal_r  bf_company_name_no_legal  bf_tf_adj_company_name_no_legal         street_address_l      street_address_r  gamma_street_address  tf_street_address_l  tf_street_address_r  bf_street_address  bf_tf_adj_street_address state_l state_r  gamma_state  tf_state_l  tf_state_r   bf_state  bf_tf_adj_state       city_l       city_r  gamma_city  tf_city_l  tf_city_r     bf_city  bf_tf_adj_city    company_name_mphone_l     company_name_mphone_r match_key  record_id_x sec_company_id central_index_key                              company_name_raw  record_id_y  utility_id_eia\n",
+       "274760     29.211020           1.000000  __splink__input_table_0  __splink__input_table_1        20588         6741                               fibermark                          fibermark                            2                    0.000037                    0.000037             477874.511191                         0.028877        161 wellington rd     161 wellington rd                     2             0.000024             0.000024        9888.266177                  0.454618      vt      vt            1    0.001537    0.001537  15.866015        33.278930  brattleboro  brattleboro           2   0.000086   0.000086  103.554689       77.162949                   FBRMRK                    FBRMRK         0        20588     0000887591        0000887591                                 fibermark inc         6741            6309\n",
+       "340414     27.884373           1.000000  __splink__input_table_0  __splink__input_table_1        51567        17450               st joseph light and power          st joseph light and power                            2                    0.000024                    0.000024             477874.511191                         0.043316           520 francis st        520 francis st                     2             0.000024             0.000024        9888.266177                  0.454618      mo      mo            1    0.010118    0.010118  15.866015         5.054515    st joseph    st joseph           2   0.000049   0.000049  103.554689      135.035162        ST JSF LT ANT PWR         ST JSF LT ANT PWR         0        51567     0000086251        0000086251                    st joseph light & power co        17450           17881\n",
+       "165487     27.757345           1.000000  __splink__input_table_0  __splink__input_table_1        58842        19906                      wausau paper mills                 wausau paper mills                            2                    0.000024                    0.000024             477874.511191                         0.043316            one clarks is         one clarks is                     2             0.000024             0.000024        9888.266177                  0.454618      wi      wi            1    0.008840    0.008840  15.866015         5.785628       wausau       wausau           2   0.000061   0.000061  103.554689      108.028129               WS PPR MLS                WS PPR MLS         0        58842     0000105076        0000105076                         wausau paper mills co        19906           20190\n",
+       "241593     27.526521           1.000000  __splink__input_table_0  __splink__input_table_1        24650         8047                    green mountain power               green mountain power                            2                    0.000037                    0.000037             477874.511191                         0.028877             163 acorn ln          163 acorn ln                     2             0.000037             0.000037        9888.266177                  0.303079      vt      vt            1    0.001537    0.001537  15.866015        33.278930   colchester   colchester           2   0.000183   0.000183  103.554689       36.009376             KRN MNTN PWR              KRN MNTN PWR         0        24650     0000043704        0000043704                     green mountain power corp         8047            7601\n",
+       "163815     27.519613           1.000000  __splink__input_table_0  __splink__input_table_1        39816        13109             northwestern public service        northwestern public service                            2                    0.000073                    0.000073             477874.511191                         0.014439           33 third st se        33 third st se                     2             0.000037             0.000037        9888.266177                  0.303079      sd      sd            1    0.001930    0.001930  15.866015        26.495963        huron        huron           2   0.000073   0.000073  103.554689       90.023441       NR0WSTRN PBLK SRFS        NR0WSTRN PBLK SRFS         0        39816     0000073088        0000073088                northwestern public service co        13109           13809\n",
+       "...              ...                ...                      ...                      ...          ...          ...                                     ...                                ...                          ...                         ...                         ...                       ...                              ...                      ...                   ...                   ...                  ...                  ...                ...                       ...     ...     ...          ...         ...         ...        ...              ...          ...          ...         ...        ...        ...         ...             ...                      ...                       ...       ...          ...            ...               ...                                           ...          ...             ...\n",
+       "218776      4.337127           0.952856  __splink__input_table_0  __splink__input_table_1        32941        17608                 lifestance health group  stirling energy systems solar one                            0                    0.000012                    0.000037                  0.986046                         1.000000     4800 n scottsdale rd  4800 n scottsdale rd                     2             0.000110             0.000110        9888.266177                  0.101026      az      az            1    0.012872    0.012872  15.866015         3.973129   scottsdale   scottsdale           2   0.004989   0.004989  103.554689        1.323874           LFSTNS HL0 KRP  STRLNK ENRJ SSTMS SLR ON         1        32941     0001845257        0001845257                 lifestance health group, inc.        17608           56166\n",
+       "145930      4.321967           0.952382  __splink__input_table_0  __splink__input_table_1        28535         9121                          imperial holly                     imperial holly                            2                    0.000024                    0.000024             477874.511191                         0.043316  one imperial sq ste 200             p o box 9                     0             0.000024             0.000159           0.881656                  1.000000      tx      tx            1    0.079841    0.079841  15.866015         0.640571   sugar land    sugarland           1   0.000355   0.000098   45.415672        1.000000                 IMPRL HL                  IMPRL HL         0        28535     0000831327        0000831327                           imperial holly corp         9121            9223\n",
+       "6194        4.272164           0.950792  __splink__input_table_0  __splink__input_table_1        32403        16195           lease investment flight trust                 se solar trust v c                            0                    0.000012                    0.000012                  0.986046                         1.000000     1100 north market st  1100 north market st                     2             0.000061             0.000061        9888.266177                  0.181847      de      de            1    0.011717    0.011717  15.866015         4.365022   wilmington   wilmington           2   0.010321   0.010321  103.554689        0.639977     LS INFSTMNT FLT TRST            S SLR TRST F K         1        32403     0001158389        0001158389                 lease investment flight trust        16195           56900\n",
+       "1135        4.272164           0.950792  __splink__input_table_0  __splink__input_table_1        22415         7605  fresenius kabi pharmaceuticals holding              genon sabine delaware                            0                    0.000012                    0.000012                  0.986046                         1.000000      2711 centerville rd   2711 centerville rd                     2             0.000061             0.000061        9888.266177                  0.181847      de      de            1    0.011717    0.011717  15.866015         4.365022   wilmington   wilmington           2   0.010321   0.010321  103.554689        0.639977  FRSNS KB FRMSTKLS HLTNK              JNN SBN TLWR         1        22415     0001439449        0001439449  fresenius kabi pharmaceuticals holding, inc.         7605           56922\n",
+       "9350        4.272164           0.950792  __splink__input_table_0  __splink__input_table_1         1626        14089                      airplanes us trust           pasadena statutory trust                            0                    0.000012                    0.000012                  0.986046                         1.000000     1100 north market st  1100 north market st                     2             0.000061             0.000061        9888.266177                  0.181847      de      de            1    0.011717    0.011717  15.866015         4.365022   wilmington   wilmington           2   0.010321   0.010321  103.554689        0.639977           ARPLNS US TRST           PSTN STTTR TRST         1         1626     0001004540        0001004540                            airplanes us trust        14089           61235\n",
        "\n",
        "[534 rows x 43 columns]"
       ]
      },
-     "execution_count": 60,
+     "execution_count": 55,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -3728,50 +3728,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 64,
-   "id": "1d3e41bd-f92a-4f77-a0a7-0bd24f7ea70c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "out_df = sec_df.merge(\n",
-    "    one_to_one_preds[[\"sec_company_id\", \"utility_id_eia\"]],\n",
-    "    how=\"left\",\n",
-    "    on=\"sec_company_id\",\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 66,
-   "id": "cce2b383-48b3-4efd-977a-0c734b0e3ec2",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "utility_id_eia\n",
-       "True     59895\n",
-       "False     1131\n",
-       "Name: count, dtype: int64"
-      ]
-     },
-     "execution_count": 66,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "out_df.utility_id_eia.isnull().value_counts()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1cf0be2e-b1ef-4eb1-a07a-28e977c40252",
+   "execution_count": 320,
+   "id": "4633e3f8-f0a3-4109-ae66-b3e898059ed7",
    "metadata": {},
    "outputs": [],
    "source": [
-    "len(one_to_one_preds"
+    "one_to_one_preds.to_parquet(\"one_to_one_preds.parquet\")"
    ]
   }
  ],